AgPerry committed on
Commit
74fbf87
·
verified ·
1 Parent(s): 7ead7b7

UI: default to V2 Hermes (single harness) + add Reset button to mirror website headline view

Browse files
Files changed (1) hide show
  1. app.py +22 -11
app.py CHANGED
@@ -135,11 +135,6 @@ def load_results() -> pd.DataFrame:
135
  df["reward_rate_strict"] = pd.NA
136
  df["reward_rate_strict"] = df["reward_rate_strict"].map(_format_pct)
137
  df["wall_hours"] = df["wall_hours"].map(_format_wall)
138
- if "avg_cost_per_task_usd" not in df.columns:
139
- df["avg_cost_per_task_usd"] = pd.NA
140
- df["avg_cost_per_task_usd"] = df["avg_cost_per_task_usd"].map(
141
- lambda v: "—" if pd.isna(v) else f"${float(v):.4f}"
142
- )
143
  df.rename(
144
  columns={
145
  "model": "Model",
@@ -150,13 +145,12 @@ def load_results() -> pd.DataFrame:
150
  "pass_rate": "Intercepted",
151
  "reward_rate": "Reward (lenient)",
152
  "reward_rate_strict": "Reward (strict)",
153
- "avg_cost_per_task_usd": "Cost / task",
154
  "wall_hours": "Wall (h)",
155
  "rank": "Rank",
156
  },
157
  inplace=True,
158
  )
159
- return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward (lenient)", "Reward (strict)", "Cost / task", "Pass", "Total", "Wall (h)"]]
160
 
161
 
162
  def filter_df(query: str, corpus: str, harness_filter: list[str]):
@@ -179,6 +173,18 @@ def all_harnesses() -> list[str]:
179
  return ["hermes", "openclaw"]
180
 
181
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
183
  gr.Markdown(INTRO)
184
 
@@ -190,10 +196,10 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
190
  gr.Markdown(TABLE_INTRO)
191
  with gr.Row():
192
  search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
193
- corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value="v2", label="Corpus", scale=2)
194
  harness_choice = gr.CheckboxGroup(
195
  choices=all_harnesses(),
196
- value=all_harnesses(),
197
  label="Harness",
198
  )
199
  # Empty placeholder — the real data is fetched fresh from the dataset on
@@ -204,9 +210,11 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
204
  value=None,
205
  interactive=False,
206
  wrap=True,
207
- column_widths=["60px", "260px", "100px", "70px", "110px", "110px", "110px", "100px", "60px", "60px", "80px"],
208
  )
209
- refresh = gr.Button("🔄 Refresh from dataset")
 
 
210
 
211
  for control in (search_bar, corpus_choice, harness_choice):
212
  control.change(
@@ -215,6 +223,9 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
215
  outputs=table,
216
  )
217
  refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
 
 
 
218
 
219
  with gr.TabItem("📝 About"):
220
  gr.Markdown(ABOUT)
 
135
  df["reward_rate_strict"] = pd.NA
136
  df["reward_rate_strict"] = df["reward_rate_strict"].map(_format_pct)
137
  df["wall_hours"] = df["wall_hours"].map(_format_wall)
 
 
 
 
 
138
  df.rename(
139
  columns={
140
  "model": "Model",
 
145
  "pass_rate": "Intercepted",
146
  "reward_rate": "Reward (lenient)",
147
  "reward_rate_strict": "Reward (strict)",
 
148
  "wall_hours": "Wall (h)",
149
  "rank": "Rank",
150
  },
151
  inplace=True,
152
  )
153
+ return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward (lenient)", "Reward (strict)", "Pass", "Total", "Wall (h)"]]
154
 
155
 
156
  def filter_df(query: str, corpus: str, harness_filter: list[str]):
 
173
  return ["hermes", "openclaw"]
174
 
175
 
176
+ # Default headline view: V2 + Hermes — matches the website's "V2 (Hermes)" main tab.
177
+ # Mirrors https://claw-bench.com/leaderboard where V2 Hermes is the primary corpus
178
+ # and other harness/corpus combos are tucked behind an "Others" toggle.
179
+ DEFAULT_CORPUS = "v2"
180
+ DEFAULT_HARNESSES = ["hermes"]
181
+
182
+
183
+ def reset_defaults():
184
+ """Restore the headline V2-Hermes view (search cleared, corpus=v2, harness=[hermes])."""
185
+ return "", DEFAULT_CORPUS, DEFAULT_HARNESSES
186
+
187
+
188
  with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
189
  gr.Markdown(INTRO)
190
 
 
196
  gr.Markdown(TABLE_INTRO)
197
  with gr.Row():
198
  search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
199
+ corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value=DEFAULT_CORPUS, label="Corpus", scale=2)
200
  harness_choice = gr.CheckboxGroup(
201
  choices=all_harnesses(),
202
+ value=DEFAULT_HARNESSES,
203
  label="Harness",
204
  )
205
  # Empty placeholder — the real data is fetched fresh from the dataset on
 
210
  value=None,
211
  interactive=False,
212
  wrap=True,
213
+ column_widths=["60px", "260px", "100px", "70px", "110px", "110px", "110px", "60px", "60px", "80px"],
214
  )
215
+ with gr.Row():
216
+ refresh = gr.Button("🔄 Refresh from dataset")
217
+ reset = gr.Button("↺ Reset to V2 Hermes default")
218
 
219
  for control in (search_bar, corpus_choice, harness_choice):
220
  control.change(
 
223
  outputs=table,
224
  )
225
  refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
226
+ # Reset writes the defaults back into the three controls; their .change
227
+ # listeners then fan out and re-render the table from the new state.
228
+ reset.click(fn=reset_defaults, inputs=[], outputs=[search_bar, corpus_choice, harness_choice])
229
 
230
  with gr.TabItem("📝 About"):
231
  gr.Markdown(ABOUT)