UI: default to V2 Hermes (single harness) + add Reset button to mirror website headline view
Browse files
app.py
CHANGED
|
@@ -135,11 +135,6 @@ def load_results() -> pd.DataFrame:
|
|
| 135 |
df["reward_rate_strict"] = pd.NA
|
| 136 |
df["reward_rate_strict"] = df["reward_rate_strict"].map(_format_pct)
|
| 137 |
df["wall_hours"] = df["wall_hours"].map(_format_wall)
|
| 138 |
-
if "avg_cost_per_task_usd" not in df.columns:
|
| 139 |
-
df["avg_cost_per_task_usd"] = pd.NA
|
| 140 |
-
df["avg_cost_per_task_usd"] = df["avg_cost_per_task_usd"].map(
|
| 141 |
-
lambda v: "—" if pd.isna(v) else f"${float(v):.4f}"
|
| 142 |
-
)
|
| 143 |
df.rename(
|
| 144 |
columns={
|
| 145 |
"model": "Model",
|
|
@@ -150,13 +145,12 @@ def load_results() -> pd.DataFrame:
|
|
| 150 |
"pass_rate": "Intercepted",
|
| 151 |
"reward_rate": "Reward (lenient)",
|
| 152 |
"reward_rate_strict": "Reward (strict)",
|
| 153 |
-
"avg_cost_per_task_usd": "Cost / task",
|
| 154 |
"wall_hours": "Wall (h)",
|
| 155 |
"rank": "Rank",
|
| 156 |
},
|
| 157 |
inplace=True,
|
| 158 |
)
|
| 159 |
-
return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward (lenient)", "Reward (strict)", "
|
| 160 |
|
| 161 |
|
| 162 |
def filter_df(query: str, corpus: str, harness_filter: list[str]):
|
|
@@ -179,6 +173,18 @@ def all_harnesses() -> list[str]:
|
|
| 179 |
return ["hermes", "openclaw"]
|
| 180 |
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
| 183 |
gr.Markdown(INTRO)
|
| 184 |
|
|
@@ -190,10 +196,10 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 190 |
gr.Markdown(TABLE_INTRO)
|
| 191 |
with gr.Row():
|
| 192 |
search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
|
| 193 |
-
corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value=
|
| 194 |
harness_choice = gr.CheckboxGroup(
|
| 195 |
choices=all_harnesses(),
|
| 196 |
-
value=
|
| 197 |
label="Harness",
|
| 198 |
)
|
| 199 |
# Empty placeholder — the real data is fetched fresh from the dataset on
|
|
@@ -204,9 +210,11 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 204 |
value=None,
|
| 205 |
interactive=False,
|
| 206 |
wrap=True,
|
| 207 |
-
column_widths=["60px", "260px", "100px", "70px", "110px", "110px", "110px", "
|
| 208 |
)
|
| 209 |
-
|
|
|
|
|
|
|
| 210 |
|
| 211 |
for control in (search_bar, corpus_choice, harness_choice):
|
| 212 |
control.change(
|
|
@@ -215,6 +223,9 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 215 |
outputs=table,
|
| 216 |
)
|
| 217 |
refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
with gr.TabItem("📝 About"):
|
| 220 |
gr.Markdown(ABOUT)
|
|
|
|
| 135 |
df["reward_rate_strict"] = pd.NA
|
| 136 |
df["reward_rate_strict"] = df["reward_rate_strict"].map(_format_pct)
|
| 137 |
df["wall_hours"] = df["wall_hours"].map(_format_wall)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
df.rename(
|
| 139 |
columns={
|
| 140 |
"model": "Model",
|
|
|
|
| 145 |
"pass_rate": "Intercepted",
|
| 146 |
"reward_rate": "Reward (lenient)",
|
| 147 |
"reward_rate_strict": "Reward (strict)",
|
|
|
|
| 148 |
"wall_hours": "Wall (h)",
|
| 149 |
"rank": "Rank",
|
| 150 |
},
|
| 151 |
inplace=True,
|
| 152 |
)
|
| 153 |
+
return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward (lenient)", "Reward (strict)", "Pass", "Total", "Wall (h)"]]
|
| 154 |
|
| 155 |
|
| 156 |
def filter_df(query: str, corpus: str, harness_filter: list[str]):
|
|
|
|
| 173 |
return ["hermes", "openclaw"]
|
| 174 |
|
| 175 |
|
| 176 |
+
# Default headline view: V2 + Hermes — matches the website's "V2 (Hermes)" main tab.
|
| 177 |
+
# Mirrors https://claw-bench.com/leaderboard, where the V2 corpus on the Hermes harness is the primary view
|
| 178 |
+
# and other harness/corpus combos are tucked behind an "Others" toggle.
|
| 179 |
+
DEFAULT_CORPUS = "v2"
|
| 180 |
+
DEFAULT_HARNESSES = ["hermes"]
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def reset_defaults():
|
| 184 |
+
"""Restore the headline V2-Hermes view (search cleared, corpus=v2, harness=[hermes])."""
|
| 185 |
+
return "", DEFAULT_CORPUS, DEFAULT_HARNESSES
|
| 186 |
+
|
| 187 |
+
|
| 188 |
with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
| 189 |
gr.Markdown(INTRO)
|
| 190 |
|
|
|
|
| 196 |
gr.Markdown(TABLE_INTRO)
|
| 197 |
with gr.Row():
|
| 198 |
search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
|
| 199 |
+
corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value=DEFAULT_CORPUS, label="Corpus", scale=2)
|
| 200 |
harness_choice = gr.CheckboxGroup(
|
| 201 |
choices=all_harnesses(),
|
| 202 |
+
value=DEFAULT_HARNESSES,
|
| 203 |
label="Harness",
|
| 204 |
)
|
| 205 |
# Empty placeholder — the real data is fetched fresh from the dataset on
|
|
|
|
| 210 |
value=None,
|
| 211 |
interactive=False,
|
| 212 |
wrap=True,
|
| 213 |
+
column_widths=["60px", "260px", "100px", "70px", "110px", "110px", "110px", "60px", "60px", "80px"],
|
| 214 |
)
|
| 215 |
+
with gr.Row():
|
| 216 |
+
refresh = gr.Button("🔄 Refresh from dataset")
|
| 217 |
+
reset = gr.Button("↺ Reset to V2 Hermes default")
|
| 218 |
|
| 219 |
for control in (search_bar, corpus_choice, harness_choice):
|
| 220 |
control.change(
|
|
|
|
| 223 |
outputs=table,
|
| 224 |
)
|
| 225 |
refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
|
| 226 |
+
# Reset writes the defaults back into the three controls; their .change
|
| 227 |
+
# listeners then fan out and re-render the table from the new state.
|
| 228 |
+
reset.click(fn=reset_defaults, inputs=[], outputs=[search_bar, corpus_choice, harness_choice])
|
| 229 |
|
| 230 |
with gr.TabItem("📝 About"):
|
| 231 |
gr.Markdown(ABOUT)
|