Upload 2 files
Browse files
- app.py +1 -1
- report_generation.jsonl +2 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 17 Jun 2026.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -85,3 +85,5 @@
|
|
| 85 |
{"Model": "Anthropic/claude-opus-4.8 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 56.8, "Physics": 38.9, "Chemistry": 73.5, "Finance": 44.8, "Consulting": 70.0, "Extraction": 49.4, "Reasoning": 56.6, "Style": 62.1, "Response Characters": 5125, "Input Tokens": 731, "Output Tokens": 12625, "Cost": 51.08}
|
| 86 |
{"Model": "MiniMax/MiniMax-M3 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 44.4, "Physics": 20.2, "Chemistry": 57.4, "Finance": 33.9, "Consulting": 66.3, "Extraction": 38.9, "Reasoning": 42.8, "Style": 51.7, "Response Characters": 6904, "Input Tokens": 617, "Output Tokens": 26030, "Cost": 5.03}
|
| 87 |
{"Model": "MiniMax/MiniMax-M3", "Category": "Open-weight Instruct", "Overall": 39.2, "Physics": 18.7, "Chemistry": 43.7, "Finance": 31.8, "Consulting": 62.4, "Extraction": 33.2, "Reasoning": 36.9, "Style": 53.6, "Response Characters": 6827, "Input Tokens": 621, "Output Tokens": 18799, "Cost": 3.64}
|
|
|
|
|
|
|
|
|
| 85 |
{"Model": "Anthropic/claude-opus-4.8 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 56.8, "Physics": 38.9, "Chemistry": 73.5, "Finance": 44.8, "Consulting": 70.0, "Extraction": 49.4, "Reasoning": 56.6, "Style": 62.1, "Response Characters": 5125, "Input Tokens": 731, "Output Tokens": 12625, "Cost": 51.08}
|
| 86 |
{"Model": "MiniMax/MiniMax-M3 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 44.4, "Physics": 20.2, "Chemistry": 57.4, "Finance": 33.9, "Consulting": 66.3, "Extraction": 38.9, "Reasoning": 42.8, "Style": 51.7, "Response Characters": 6904, "Input Tokens": 617, "Output Tokens": 26030, "Cost": 5.03}
|
| 87 |
{"Model": "MiniMax/MiniMax-M3", "Category": "Open-weight Instruct", "Overall": 39.2, "Physics": 18.7, "Chemistry": 43.7, "Finance": 31.8, "Consulting": 62.4, "Extraction": 33.2, "Reasoning": 36.9, "Style": 53.6, "Response Characters": 6827, "Input Tokens": 621, "Output Tokens": 18799, "Cost": 3.64}
|
| 88 |
+
{"Model": "Z-AI/GLM-5.2 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.8, "Physics": 30.6, "Chemistry": 57.6, "Finance": 25.4, "Consulting": 69.6, "Extraction": 38.2, "Reasoning": 46.5, "Style": 62.1, "Response Characters": 4924, "Input Tokens": 477, "Output Tokens": 12091, "Cost": 8.62}
|
| 89 |
+
{"Model": "Z-AI/GLM-5.2", "Category": "Open-weight Instruct", "Overall": 39.6, "Physics": 21.8, "Chemistry": 45.0, "Finance": 24.8, "Consulting": 66.8, "Extraction": 32.1, "Reasoning": 38.6, "Style": 53.6, "Response Characters": 4867, "Input Tokens": 465, "Output Tokens": 10330, "Cost": 6.69}
|