Spaces:

nvidia
/

ProfBench

Running

zhilinw committed 3 days ago

Commit

f9cfd53

verified ·

1 Parent(s): e2c7604

Upload 2 files

Files changed (2) hide show

app.py CHANGED Viewed

@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
         with gr.TabItem("Report Generation"):
             with gr.Row():
                 with gr.Column(scale=7):
-                    gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 16 Feb 2026.")
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):

         with gr.TabItem("Report Generation"):
             with gr.Row():
                 with gr.Column(scale=7):
+                    gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 17 Feb 2026.")
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):

report_generation.jsonl CHANGED Viewed

@@ -60,3 +60,4 @@
 {"Model": "Qwen/Qwen3.5-397B-A17B", "Category": "Open-weight Instruct", "Overall": 45.1, "Physics": 38.5, "Chemistry": 51.6, "Finance": 27.4, "Consulting": 62.8, "Extraction": 37.1, "Reasoning": 46.0, "Style": 61.6, "Response Characters": 6134, "Input Tokens": 496, "Output Tokens": 11052, "Cost": 6.41}
 {"Model": "MiniMax/MiniMax-M2.5 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 42.2, "Physics": 29.6, "Chemistry": 50.5, "Finance": 24.8, "Consulting": 63.8, "Extraction": 30.2, "Reasoning": 41.5, "Style": 62.6, "Response Characters": 15835, "Input Tokens": 484, "Output Tokens": 20003, "Cost": 3.86}
 {"Model": "MiniMax/MiniMax-M2.5", "Category": "Open-weight Instruct", "Overall": 40.0, "Physics": 29.6, "Chemistry": 43.9, "Finance": 22.8, "Consulting": 63.5, "Extraction": 28.9, "Reasoning": 39.8, "Style": 53.6, "Response Characters": 5547, "Input Tokens": 484, "Output Tokens": 18356, "Cost": 3.55}

 {"Model": "Qwen/Qwen3.5-397B-A17B", "Category": "Open-weight Instruct", "Overall": 45.1, "Physics": 38.5, "Chemistry": 51.6, "Finance": 27.4, "Consulting": 62.8, "Extraction": 37.1, "Reasoning": 46.0, "Style": 61.6, "Response Characters": 6134, "Input Tokens": 496, "Output Tokens": 11052, "Cost": 6.41}
 {"Model": "MiniMax/MiniMax-M2.5 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 42.2, "Physics": 29.6, "Chemistry": 50.5, "Finance": 24.8, "Consulting": 63.8, "Extraction": 30.2, "Reasoning": 41.5, "Style": 62.6, "Response Characters": 15835, "Input Tokens": 484, "Output Tokens": 20003, "Cost": 3.86}
 {"Model": "MiniMax/MiniMax-M2.5", "Category": "Open-weight Instruct", "Overall": 40.0, "Physics": 29.6, "Chemistry": 43.9, "Finance": 22.8, "Consulting": 63.5, "Extraction": 28.9, "Reasoning": 39.8, "Style": 53.6, "Response Characters": 5547, "Input Tokens": 484, "Output Tokens": 18356, "Cost": 3.55}
+{"Model": "Anthropic/claude-sonnet-4.6 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 55.6, "Physics": 41.8, "Chemistry": 65.4, "Finance": 48.1, "Consulting": 67.3, "Extraction": 49.6, "Reasoning": 56.4, "Style": 55.0, "Response Characters": 19002, "Input Tokens": 531, "Output Tokens": 7205, "Cost": 17.55}