Spaces:

TIGER-Lab
/

ClawBench

Running

App Files Community

AgPerry committed 11 days ago

Commit

74fbf87

verified ·

1 Parent(s): 7ead7b7

UI: default to V2 Hermes (single harness) + add Reset button to mirror website headline view

Browse files

Files changed (1) hide show

app.py +22 -11

app.py CHANGED Viewed

@@ -135,11 +135,6 @@ def load_results() -> pd.DataFrame:
         df["reward_rate_strict"] = pd.NA
     df["reward_rate_strict"] = df["reward_rate_strict"].map(_format_pct)
     df["wall_hours"] = df["wall_hours"].map(_format_wall)
-    if "avg_cost_per_task_usd" not in df.columns:
-        df["avg_cost_per_task_usd"] = pd.NA
-    df["avg_cost_per_task_usd"] = df["avg_cost_per_task_usd"].map(
-        lambda v: "—" if pd.isna(v) else f"${float(v):.4f}"
-    )
     df.rename(
         columns={
             "model": "Model",
@@ -150,13 +145,12 @@ def load_results() -> pd.DataFrame:
             "pass_rate": "Intercepted",
             "reward_rate": "Reward (lenient)",
             "reward_rate_strict": "Reward (strict)",
-            "avg_cost_per_task_usd": "Cost / task",
             "wall_hours": "Wall (h)",
             "rank": "Rank",
         },
         inplace=True,
     )
-    return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward (lenient)", "Reward (strict)", "Cost / task", "Pass", "Total", "Wall (h)"]]
 def filter_df(query: str, corpus: str, harness_filter: list[str]):
@@ -179,6 +173,18 @@ def all_harnesses() -> list[str]:
         return ["hermes", "openclaw"]
 with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
     gr.Markdown(INTRO)
@@ -190,10 +196,10 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
             gr.Markdown(TABLE_INTRO)
             with gr.Row():
                 search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
-                corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value="v2", label="Corpus", scale=2)
             harness_choice = gr.CheckboxGroup(
                 choices=all_harnesses(),
-                value=all_harnesses(),
                 label="Harness",
             )
             # Empty placeholder — the real data is fetched fresh from the dataset on
@@ -204,9 +210,11 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
                 value=None,
                 interactive=False,
                 wrap=True,
-                column_widths=["60px", "260px", "100px", "70px", "110px", "110px", "110px", "100px", "60px", "60px", "80px"],
             )
-            refresh = gr.Button("🔄 Refresh from dataset")
             for control in (search_bar, corpus_choice, harness_choice):
                 control.change(
@@ -215,6 +223,9 @@ with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
                     outputs=table,
                 )
             refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
         with gr.TabItem("📝 About"):
             gr.Markdown(ABOUT)

         df["reward_rate_strict"] = pd.NA
     df["reward_rate_strict"] = df["reward_rate_strict"].map(_format_pct)
     df["wall_hours"] = df["wall_hours"].map(_format_wall)
     df.rename(
         columns={
             "model": "Model",
             "pass_rate": "Intercepted",
             "reward_rate": "Reward (lenient)",
             "reward_rate_strict": "Reward (strict)",
             "wall_hours": "Wall (h)",
             "rank": "Rank",
         },
         inplace=True,
     )
+    return df[["Rank", "Model", "Harness", "Corpus", "Intercepted", "Reward (lenient)", "Reward (strict)", "Pass", "Total", "Wall (h)"]]
 def filter_df(query: str, corpus: str, harness_filter: list[str]):
         return ["hermes", "openclaw"]
+# Default headline view: V2 + Hermes — matches the website's "V2 (Hermes)" main tab.
+# Mirrors https://claw-bench.com/leaderboard where V2 Hermes is the primary corpus
+# and other harness/corpus combos are tucked behind an "Others" toggle.
+DEFAULT_CORPUS = "v2"
+DEFAULT_HARNESSES = ["hermes"]
+def reset_defaults():
+    """Restore the headline V2-Hermes view (search cleared, corpus=v2, harness=[hermes])."""
+    return "", DEFAULT_CORPUS, DEFAULT_HARNESSES
 with gr.Blocks(title="ClawBench Leaderboard", theme=gr.themes.Soft()) as demo:
     gr.Markdown(INTRO)
             gr.Markdown(TABLE_INTRO)
             with gr.Row():
                 search_bar = gr.Textbox(placeholder="Search models…", show_label=False, scale=3)
+                corpus_choice = gr.Radio(choices=["all", "v2", "v1"], value=DEFAULT_CORPUS, label="Corpus", scale=2)
             harness_choice = gr.CheckboxGroup(
                 choices=all_harnesses(),
+                value=DEFAULT_HARNESSES,
                 label="Harness",
             )
             # Empty placeholder — the real data is fetched fresh from the dataset on
                 value=None,
                 interactive=False,
                 wrap=True,
+                column_widths=["60px", "260px", "100px", "70px", "110px", "110px", "110px", "60px", "60px", "80px"],
             )
+            with gr.Row():
+                refresh = gr.Button("🔄 Refresh from dataset")
+                reset = gr.Button("↺ Reset to V2 Hermes default")
             for control in (search_bar, corpus_choice, harness_choice):
                 control.change(
                     outputs=table,
                 )
             refresh.click(fn=filter_df, inputs=[search_bar, corpus_choice, harness_choice], outputs=table)
+            # Reset writes the defaults back into the three controls; their .change
+            # listeners then fan out and re-render the table from the new state.
+            reset.click(fn=reset_defaults, inputs=[], outputs=[search_bar, corpus_choice, harness_choice])
         with gr.TabItem("📝 About"):
             gr.Markdown(ABOUT)