Upload 2 files
Browse files
- app.py +1 -1
- report_generation.jsonl +2 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 18 Apr 2026.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -71,3 +71,5 @@
|
|
| 71 |
{"Model": "Google/Gemma-4-31B-It (Thinking)", "Category": "Open-weight Reasoning", "Overall": 44.4, "Physics": 35.6, "Chemistry": 52.2, "Finance": 21.4, "Consulting": 68.2, "Extraction": 33.0, "Reasoning": 44.6, "Style": 65.9, "Response Characters": 4102, "Input Tokens": 494, "Output Tokens": 6741, "Cost": 0.44}
|
| 72 |
{"Model": "Google/Gemma-4-31B-It", "Category": "Open-weight Instruct", "Overall": 42.1, "Physics": 33.2, "Chemistry": 46.8, "Finance": 22.0, "Consulting": 66.3, "Extraction": 31.6, "Reasoning": 43.0, "Style": 66.5, "Response Characters": 3903, "Input Tokens": 492, "Output Tokens": 1333, "Cost": 0.1}
|
| 73 |
{"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
|
|
|
|
|
|
|
|
|
| 71 |
{"Model": "Google/Gemma-4-31B-It (Thinking)", "Category": "Open-weight Reasoning", "Overall": 44.4, "Physics": 35.6, "Chemistry": 52.2, "Finance": 21.4, "Consulting": 68.2, "Extraction": 33.0, "Reasoning": 44.6, "Style": 65.9, "Response Characters": 4102, "Input Tokens": 494, "Output Tokens": 6741, "Cost": 0.44}
|
| 72 |
{"Model": "Google/Gemma-4-31B-It", "Category": "Open-weight Instruct", "Overall": 42.1, "Physics": 33.2, "Chemistry": 46.8, "Finance": 22.0, "Consulting": 66.3, "Extraction": 31.6, "Reasoning": 43.0, "Style": 66.5, "Response Characters": 3903, "Input Tokens": 492, "Output Tokens": 1333, "Cost": 0.1}
|
| 73 |
{"Model": "Anthropic/claude-opus-4.7 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 61.3, "Physics": 42.9, "Chemistry": 75.5, "Finance": 52.9, "Consulting": 73.9, "Extraction": 56.9, "Reasoning": 62.3, "Style": 63.8, "Response Characters": 10050, "Input Tokens": 736, "Output Tokens": 5668, "Cost": 23.26}
|
| 74 |
+
{"Model": "Z-AI/GLM-5.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.6, "Physics": 25.7, "Chemistry": 50.1, "Finance": 24.3, "Consulting": 66.1, "Extraction": 32.6, "Reasoning": 41.7, "Style": 61.1, "Response Characters": 4913, "Input Tokens": 470, "Output Tokens": 20217, "Cost": 10.26}
|
| 75 |
+
{"Model": "Z-AI/GLM-5.1", "Category": "Open-weight Instruct", "Overall": 38.1, "Physics": 16.9, "Chemistry": 42.2, "Finance": 28.1, "Consulting": 65.1, "Extraction": 32.3, "Reasoning": 35.8, "Style": 59.7, "Response Characters": 4996, "Input Tokens": 472, "Output Tokens": 18489, "Cost": 9.39}
|