sxie78-dd committed on
Commit
3a013b1
·
unverified ·
1 Parent(s): 3f9933c

update leaderboard results

Browse files
app.py CHANGED
@@ -10,14 +10,15 @@ from src.about import (
10
  )
11
  from src.display.css_html_js import custom_css
12
  from src.display.utils import (
13
- BENCHMARK_COLS,
14
- COLS,
15
- AutoEvalColumn,
 
 
 
16
  fields,
17
  )
18
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
19
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
20
- from src.submission.submit import add_new_eval
21
  from src.populate import get_leaderboard_df
22
 
23
 
@@ -25,31 +26,51 @@ def restart_space():
25
  API.restart_space(repo_id=REPO_ID)
26
 
27
 
28
- LEADERBOARD_DF = get_leaderboard_df(
29
- EVAL_RESULTS_PATH + "/ARFBench_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
 
 
 
 
30
  )
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- def init_leaderboard(dataframe):
 
34
  if dataframe is None or dataframe.empty:
35
  raise ValueError("Leaderboard DataFrame is empty or None.")
36
  return Leaderboard(
37
  value=dataframe,
38
- datatype=[c.type for c in fields(AutoEvalColumn)],
39
  select_columns=SelectColumns(
40
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
41
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
42
  label="Select Columns to Display:",
43
  ),
44
- search_columns=[AutoEvalColumn.model.name],
45
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
46
  filter_columns=[
47
  ColumnFilter(
48
- AutoEvalColumn.pass_at_1.name,
49
  type="slider",
50
  min=0,
51
  max=100,
52
- label="pass@1 score",
53
  ),
54
  ],
55
  bool_checkboxgroup_label="Hide models",
@@ -64,7 +85,30 @@ with demo:
64
 
65
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
66
  with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
67
- leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
70
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
10
  )
11
  from src.display.css_html_js import custom_css
12
  from src.display.utils import (
13
+ CATEGORY_ACCURACY_COLS,
14
+ CATEGORY_F1_COLS,
15
+ OVERALL_TIER_COLS,
16
+ CategoryAccuracyColumn,
17
+ CategoryF1Column,
18
+ OverallTierColumn,
19
  fields,
20
  )
21
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
 
22
  from src.populate import get_leaderboard_df
23
 
24
 
 
26
  API.restart_space(repo_id=REPO_ID)
27
 
28
 
29
+ OVERALL_TIER_LEADERBOARD_DF = get_leaderboard_df(
30
+ EVAL_RESULTS_PATH + "/ARFBench_leaderboard.csv",
31
+ EVAL_REQUESTS_PATH,
32
+ OVERALL_TIER_COLS,
33
+ OVERALL_TIER_COLS,
34
+ sort_by="overall_f1",
35
  )
36
 
37
+ CATEGORY_F1_LEADERBOARD_DF = get_leaderboard_df(
38
+ EVAL_RESULTS_PATH + "/ARFBench_leaderboard_category_f1.csv",
39
+ EVAL_REQUESTS_PATH,
40
+ CATEGORY_F1_COLS,
41
+ CATEGORY_F1_COLS,
42
+ sort_by="overall_f1",
43
+ )
44
+
45
+ CATEGORY_ACCURACY_LEADERBOARD_DF = get_leaderboard_df(
46
+ EVAL_RESULTS_PATH + "/ARFBench_leaderboard_category_accuracy.csv",
47
+ EVAL_REQUESTS_PATH,
48
+ CATEGORY_ACCURACY_COLS,
49
+ CATEGORY_ACCURACY_COLS,
50
+ sort_by="overall_accuracy",
51
+ )
52
 
53
+
54
+ def init_custom_leaderboard(dataframe, column_class, filter_column_name, filter_label):
55
  if dataframe is None or dataframe.empty:
56
  raise ValueError("Leaderboard DataFrame is empty or None.")
57
  return Leaderboard(
58
  value=dataframe,
59
+ datatype=[c.type for c in fields(column_class)],
60
  select_columns=SelectColumns(
61
+ default_selection=[c.name for c in fields(column_class) if c.displayed_by_default],
62
+ cant_deselect=[c.name for c in fields(column_class) if c.never_hidden],
63
  label="Select Columns to Display:",
64
  ),
65
+ search_columns=[column_class.model.name],
66
+ hide_columns=[c.name for c in fields(column_class) if c.hidden],
67
  filter_columns=[
68
  ColumnFilter(
69
+ filter_column_name,
70
  type="slider",
71
  min=0,
72
  max=100,
73
+ label=filter_label,
74
  ),
75
  ],
76
  bool_checkboxgroup_label="Hide models",
 
85
 
86
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
87
  with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
88
+ with gr.Tabs(selected=0):
89
+ with gr.TabItem("Overall + Tier (Default)", id=0):
90
+ leaderboard_overall_tier = init_custom_leaderboard(
91
+ OVERALL_TIER_LEADERBOARD_DF,
92
+ OverallTierColumn,
93
+ OverallTierColumn.overall_f1.name,
94
+ "Overall F1 score",
95
+ )
96
+
97
+ with gr.TabItem("Per-Category F1", id=1):
98
+ leaderboard_category_f1 = init_custom_leaderboard(
99
+ CATEGORY_F1_LEADERBOARD_DF,
100
+ CategoryF1Column,
101
+ CategoryF1Column.overall_f1.name,
102
+ "Overall F1 score",
103
+ )
104
+
105
+ with gr.TabItem("Per-Category Accuracy", id=2):
106
+ leaderboard_category_accuracy = init_custom_leaderboard(
107
+ CATEGORY_ACCURACY_LEADERBOARD_DF,
108
+ CategoryAccuracyColumn,
109
+ CategoryAccuracyColumn.overall_accuracy.name,
110
+ "Overall Accuracy score",
111
+ )
112
 
113
  with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
114
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
results/ARFBench_leaderboard.csv CHANGED
@@ -1,14 +1,22 @@
1
- Model,pass@1,pass@5,Presence,Identification,Start Time,End Time,Magnitude,Categorization,Correlation,Indicator
2
- Random Choice,23.5,-,50.0,12.5,18.2,16.7,12.5,16.7,20.0,20.0
3
- Frequent Choice,46.9,-,82.9,36.8,21.4,31.3,26.3,30.8,82.9,31.7
4
- Oracle GPT-4o,57.5,-,87.4,34.2,26.8,25.0,55.3,62.5,82.9,28.6
5
- GPT-4.1,57.6,62.2,82.9,39.5,39.2,37.5,60.5,54.8,72.9,34.9
6
- Claude 3.7 Sonnet,56.7,57.4,85.6,34.2,41.0,40.6,53.9,54.8,67.1,36.5
7
- GPT-4o,54.4,60.2,82.0,28.9,23.2,34.4,52.6,52.9,80.0,34.9
8
- o4-mini,48.5,64.5,80.2,13.2,33.9,43.8,48.7,47.1,57.1,22.2
9
- InternVL3-78B,43.5,47.2,84.7,31.6,32.1,31.3,30.3,47.1,24.3,25.4
10
- Qwen2.5-VL-72B,41.1,53.8,83.8,21.1,25.0,6.3,32.9,26.0,48.6,36.5
11
- LlaVa-OneVision-72B,38.9,44.6,76.6,26.3,23.2,15.6,21.1,42.3,42.9,17.5
12
- Llama-3.2-Vision-90B,34.9,37.8,76.6,13.2,26.8,21.8,28.9,21.2,35.7,17.5
13
- QvQ,19.8,22.9,11.7,15.8,1.79,0.00,27.6,36.5,28.6,15.9
14
- ChatTS,10.0,10.4,3.60,10.5,14.3,3.13,9.21,14.4,18.6,4.76
 
 
 
 
 
 
 
 
 
1
+ Model,Model Type,Accuracy,Tier I Accuracy,Tier II Accuracy,Tier III Accuracy,Overall F1,Tier I F1,Tier II F1,Tier III F1
2
+ Random Choice,Baseline,24.5,50.0,20.0,20.0,22.5,45.6,20.5,16.8
3
+ Per-category Frequent Choice,Baseline,45.1,84.7,30.1,45.6,17.3,45.9,12.3,12.5
4
+ Non-domain Experts (n=2),Baseline,69.7,80.4,63.2,72.0,60.7,68.0,59.9,59.0
5
+ Domain Experts (n=2),Baseline,72.7,89.3,67.7,71.4,64.6,76.1,64.5,60.9
6
+ Model-Expert Oracle,Baseline,87.2,96.4,80.3,90.5,82.8,89.0,77.1,86.3
7
+ Qwen3 32B,LLM,47.9,80.9,35.1,48.6,36.1,55.7,31.5,33.8
8
+ GPT-5 (text),LLM,56.4,82.6,45.2,57.9,43.8,66.1,39.6,40.3
9
+ Qwen3-VL 8B,VLM,45.3,80.2,40.8,37.8,34.7,63.5,36.1,23.6
10
+ Claude Sonnet 4.5,VLM,47.2,83.8,43.5,38.4,37.9,63.2,40.6,26.9
11
+ GPT-4o,VLM,47.2,79.3,49.0,34.8,42.4,64.2,43.8,33.8
12
+ GPT-4.1,VLM,47.9,80.2,50.3,34.8,44.0,65.1,48.0,33.1
13
+ Qwen3-VL 32B,VLM,52.8,80.2,46.7,49.2,45.1,65.1,41.9,41.3
14
+ Claude Opus 4.6,VLM,54.8,88.3,52.3,45.9,46.7,65.8,49.1,38.2
15
+ Gemini 3 Pro,VLM,58.1,82.9,51.0,56.5,49.6,67.8,49.7,43.4
16
+ GPT-5.4,VLM,61.3,81.1,54.2,61.3,51.4,62.6,50.4,48.4
17
+ GPT-5,VLM,62.7,82.0,55.9,62.5,51.9,66.9,51.2,47.5
18
+ OpenTSLM 1B (TS-LLM),Post-trained TSFM,0.8,0.0,2.0,0.0,1.2,0.0,3.0,0.0
19
+ ChatTS 8B (TS-LLM),Post-trained TSFM,31.1,60.4,26.5,25.5,22.1,48.1,20.0,15.4
20
+ Toto-Qwen3 32B (TSFM-LLM),Post-trained TSFM,48.8,82.9,47.4,38.7,33.9,60.0,43.6,16.4
21
+ Qwen3-VL 32B (post-trained),Post-trained TSFM,56.9,84.7,50.3,53.8,46.6,69.8,44.9,40.5
22
+ Toto-VLM 32B (TSFM-VLM),Post-trained TSFM,63.9,84.7,55.6,64.6,48.9,66.3,48.4,43.5
results/ARFBench_leaderboard_category_accuracy.csv ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Model Type,Overall Accuracy,Presence,Identification,Start Time,End Time,Magnitude,Categorization,Correlation,Indicator
2
+ Random Choice,Baseline,24.5,50.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
3
+ Per-category Frequent Choice,Baseline,45.1,84.7,36.8,35.7,34.4,17.1,32.7,42.9,48.5
4
+ Non-domain Experts (n=2),Baseline,69.7,80.4,66.7,64.3,68.8,60.5,61.5,72.1,72.0
5
+ Domain Experts (n=2),Baseline,72.7,89.3,77.8,67.9,75.0,60.5,72.4,74.4,68.3
6
+ Model-Expert Oracle,Baseline,87.2,96.4,77.8,78.6,100.0,68.4,84.6,95.4,85.4
7
+ Qwen3 32B (text),LLM,47.9,80.9,28.9,27.3,35.5,37.3,39.8,50.9,46.3
8
+ GPT-5 (text),LLM,56.4,82.6,47.4,29.6,38.7,51.4,50.0,56.9,59.0
9
+ Qwen3-VL 8B,VLM,45.3,80.2,26.3,25.0,31.3,57.9,45.2,57.1,17.8
10
+ Claude Sonnet 4.5,VLM,47.2,83.8,18.4,30.4,37.5,53.9,53.8,58.8,17.2
11
+ GPT-4o,VLM,47.2,79.3,39.5,35.7,43.8,61.8,51.9,45.3,23.9
12
+ GPT-4.1,VLM,47.9,80.2,28.9,33.9,40.6,68.4,56.7,45.9,23.3
13
+ Qwen3-VL 32B,VLM,52.8,80.2,23.7,33.9,56.3,59.2,50.0,61.8,36.2
14
+ Claude Opus 4.6,VLM,54.8,88.3,31.6,37.5,53.1,57.9,63.5,65.9,25.2
15
+ Gemini 3 Pro,VLM,58.1,82.9,28.9,44.6,62.5,56.7,54.8,71.2,41.1
16
+ GPT-5.4,VLM,61.3,81.1,31.6,63.6,65.6,57.9,56.7,61.8,60.7
17
+ GPT-5,VLM,62.7,82.0,31.6,44.6,68.8,65.8,59.6,63.5,61.3
18
+ OpenTSLM (TS-LLM),Post-trained TSFM,0.8,0.0,0.0,3.6,0.0,5.3,0.0,0.0,0.0
19
+ ChatTS (TS-LLM),Post-trained TSFM,31.1,59.5,15.8,16.1,15.6,28.9,20.2,40.0,14.7
20
+ Toto-Qwen3 (TSFM-LLM),Post-trained TSFM,48.8,82.9,10.5,35.7,34.4,47.4,71.2,41.8,35.6
21
+ Qwen3-VL 32B (post-trained),Post-trained TSFM,56.9,84.7,36.8,41.1,43.8,63.2,52.9,67.6,39.3
22
+ Toto-VLM (TSFM-VLM),Post-trained TSFM,63.9,84.7,47.4,26.8,59.4,64.5,66.3,68.8,60.1
results/ARFBench_leaderboard_category_f1.csv ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Model Type,Overall F1,Presence,Identification,Start Time,End Time,Magnitude,Categorization,Correlation,Indicator
2
+ Random Choice,Baseline,22.5,45.6,21.2,18.9,18.2,20.4,21.7,15.8,17.8
3
+ Per-category Frequent Choice,Baseline,17.3,45.9,10.8,16.3,14.1,6.0,14.6,12.0,13.1
4
+ Non-domain Experts (n=2),Baseline,61.3,68.0,79.0,67.4,67.2,40.3,61.2,58.4,62.4
5
+ Domain Experts (n=2),Baseline,64.6,76.1,77.5,74.2,72.6,51.8,67.3,64.1,57.6
6
+ Model-Expert Oracle,Baseline,82.8,89.0,68.3,83.4,100.0,67.0,75.6,94.4,77.8
7
+ Qwen3 32B,LLM,36.1,55.7,28.4,26.6,26.9,31.4,36.8,32.3,35.4
8
+ GPT-5 (text),LLM,43.8,66.1,38.1,27.9,27.0,44.8,47.6,38.0,42.4
9
+ Qwen3-VL 8B,VLM,34.7,63.5,28.6,21.8,23.5,47.0,42.8,33.1,13.8
10
+ Claude Sonnet 4.5,VLM,37.9,63.2,16.8,33.2,31.3,49.3,49.8,33.8,19.8
11
+ GPT-4o,VLM,42.4,64.2,34.6,30.3,36.1,51.8,50.8,40.1,27.2
12
+ GPT-4.1,VLM,44.0,65.1,29.2,33.5,32.7,63.7,55.9,42.9,23.3
13
+ Qwen3-VL 32B,VLM,45.1,65.1,25.0,30.8,46.7,46.9,49.0,47.5,34.7
14
+ Claude Opus 4.6,VLM,46.7,65.8,34.3,36.1,45.1,53.8,59.2,51.6,24.1
15
+ Gemini 3 Pro,VLM,49.6,67.8,38.6,43.3,57.1,50.3,54.5,57.0,29.2
16
+ GPT-5.4,VLM,51.4,62.6,29.6,53.3,55.1,51.7,54.1,47.7,49.1
17
+ GPT-5,VLM,51.9,66.8,32.8,44.2,47.8,59.1,57.0,49.0,45.9
18
+ OpenTSLM 1B (TS-LLM),Post-trained TSFM,1.2,0.0,8.2,2.7,0.0,6.0,0.0,0.0,0.0
19
+ ChatTS 8B (TS-LLM),Post-trained TSFM,22.1,48.1,22.2,15.0,14.4,27.9,17.9,21.4,9.2
20
+ Toto-Qwen3 32B (TSFM-LLM),Post-trained TSFM,33.9,59.9,17.5,41.3,23.0,35.9,66.2,18.6,14.1
21
+ Qwen3-VL 32B (post-trained),Post-trained TSFM,46.6,69.7,40.5,37.2,36.7,48.9,50.3,46.8,33.9
22
+ Toto-VLM 32B (TSFM-VLM),Post-trained TSFM,48.9,66.3,46.9,23.0,48.8,54.1,58.4,44.2,42.7
src/about.py CHANGED
@@ -27,7 +27,7 @@ TITLE = """<h1 align="center" id="space-title">ARFBench Multimodal Time Series R
27
  # What does your leaderboard evaluate?
28
  INTRODUCTION_TEXT = """
29
  **ARF**Bench (**A**nomaly **R**easoning **F**ramework Benchmark) is a
30
- multimodal time-series reasoning benchmark consisting of 550 question-answer
31
  (QA) pairs composed from real-world incident data collected at Datadog,
32
  a leading observability platform.
33
 
 
27
  # What does your leaderboard evaluate?
28
  INTRODUCTION_TEXT = """
29
  **ARF**Bench (**A**nomaly **R**easoning **F**ramework Benchmark) is a
30
+ multimodal time-series reasoning benchmark consisting of 750 question-answer
31
  (QA) pairs composed from real-world incident data collected at Datadog,
32
  a leading observability platform.
33
 
src/display/formatting.py CHANGED
@@ -1,24 +1,41 @@
1
  def model_hyperlink(link, model_name):
2
  if model_name == "":
3
  return model_name
4
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
 
 
 
5
 
6
 
7
  def make_clickable_model(model_name):
 
 
 
 
 
 
 
 
 
8
  link = f"https://huggingface.co/{model_name}"
9
  return model_hyperlink(link, model_name)
10
 
11
 
12
  def styled_error(error):
13
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
14
 
15
 
16
  def styled_warning(warn):
17
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
 
18
 
19
 
20
  def styled_message(message):
21
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
 
22
 
23
 
24
  def has_no_nan_values(df, columns):
 
1
def model_hyperlink(link, model_name):
    """Wrap *model_name* in an HTML anchor pointing at *link*.

    An empty model name is returned unchanged so blank table cells stay blank.
    """
    if model_name == "":
        return model_name
    anchor_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
10
 
11
 
12
def make_clickable_model(model_name):
    """Render *model_name* as a Hugging Face hub link when it looks like a repo id.

    Values that are not ``org/model`` paths (baseline rows, display names with
    spaces, non-string cells) are returned as plain, stripped text.
    """
    name = str(model_name).strip()

    # Only convert valid Hugging Face repository paths (org/model) into links.
    looks_like_repo_id = "/" in name and " " not in name
    if not looks_like_repo_id:
        return name

    return model_hyperlink(f"https://huggingface.co/{name}", name)
24
 
25
 
26
  def styled_error(error):
27
+ style = "color: red; font-size: 20px; text-align: center;"
28
+ return f"<p style='{style}'>{error}</p>"
29
 
30
 
31
  def styled_warning(warn):
32
+ style = "color: orange; font-size: 20px; text-align: center;"
33
+ return f"<p style='{style}'>{warn}</p>"
34
 
35
 
36
  def styled_message(message):
37
+ style = "color: green; font-size: 20px; text-align: center;"
38
+ return f"<p style='{style}'>{message}</p>"
39
 
40
 
41
  def has_no_nan_values(df, columns):
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
1
- from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
4
 
@@ -19,27 +19,76 @@ class ColumnContent:
19
 
20
 
21
  # ARFBench Leaderboard columns
22
- auto_eval_column_dict = []
23
- # Model column (always displayed)
24
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
25
- # Performance metrics
26
- auto_eval_column_dict.append(["pass_at_1", ColumnContent, ColumnContent("pass@1", "number", True)])
27
- auto_eval_column_dict.append(["pass_at_5", ColumnContent, ColumnContent("pass@5", "number", True)])
28
- # Specific benchmark metrics
29
- auto_eval_column_dict.append(["presence", ColumnContent, ColumnContent("Presence", "number", True)])
30
- auto_eval_column_dict.append(["identification", ColumnContent, ColumnContent("Identification", "number", True)])
31
- auto_eval_column_dict.append(["start_time", ColumnContent, ColumnContent("Start Time", "number", True)])
32
- auto_eval_column_dict.append(["end_time", ColumnContent, ColumnContent("End Time", "number", True)])
33
- auto_eval_column_dict.append(["magnitude", ColumnContent, ColumnContent("Magnitude", "number", True)])
34
- auto_eval_column_dict.append(["categorization", ColumnContent, ColumnContent("Categorization", "number", True)])
35
- auto_eval_column_dict.append(["correlation", ColumnContent, ColumnContent("Correlation", "number", True)])
36
- auto_eval_column_dict.append(["indicator", ColumnContent, ColumnContent("Indicator", "number", True)])
37
-
38
- # We use make dataclass to dynamically fill the scores
39
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
40
-
41
-
42
- ## For the queue columns in the submission tab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  @dataclass(frozen=True)
44
  class EvalQueueColumn: # Queue column
45
  model = ColumnContent("model", "markdown", True)
@@ -50,7 +99,7 @@ class EvalQueueColumn: # Queue column
50
  status = ColumnContent("status", "str", True)
51
 
52
 
53
- ## All the model information that we might need
54
  @dataclass
55
  class ModelDetails:
56
  name: str
@@ -59,10 +108,9 @@ class ModelDetails:
59
 
60
 
61
  class ModelType(Enum):
62
- OS_VLM = ModelDetails(name="open vision-language", symbol="🟢")
63
- P_VLM = ModelDetails(name="proprietary vision-language", symbol="🔶")
64
- TSFM = ModelDetails(name="time-series FM", symbol="⭕")
65
- R = ModelDetails(name="reasoning", symbol="🟦")
66
  Unknown = ModelDetails(name="", symbol="?")
67
 
68
  def to_str(self, separator=" "):
@@ -70,13 +118,11 @@ class ModelType(Enum):
70
 
71
  @staticmethod
72
  def from_str(type):
73
- if "proprietary vision-language" in type or "🔶" in type:
74
- return ModelType.P_VLM
75
- if "open vision-language" in type or "🟢" in type:
76
- return ModelType.OS_VLM
77
- if "reasoning" in type or "🟦" in type:
78
- return ModelType.R
79
- if "time-series FM" in type or "⭕" in type:
80
  return ModelType.TSFM
81
  return ModelType.Unknown
82
 
@@ -108,8 +154,12 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  # Define the benchmark columns for ARFBench
110
  BENCHMARK_COLS = [
111
- "pass_at_1",
112
- "pass_at_5",
 
 
 
 
113
  "presence",
114
  "identification",
115
  "start_time",
@@ -119,3 +169,9 @@ BENCHMARK_COLS = [
119
  "correlation",
120
  "indicator",
121
  ]
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
  from enum import Enum
3
 
4
 
 
19
 
20
 
21
  # ARFBench Leaderboard columns
22
+ @dataclass(frozen=True)
23
+ class AutoEvalColumn:
24
+ # Model column (always displayed)
25
+ model = ColumnContent("model", "markdown", True, never_hidden=True)
26
+ # Model type column
27
+ model_type = ColumnContent("model_type", "str", True)
28
+ # Performance metrics
29
+ overall_f1 = ColumnContent("overall_f1", "number", True)
30
+ tier_i_f1 = ColumnContent("tier_i_f1", "number", True)
31
+ tier_ii_f1 = ColumnContent("tier_ii_f1", "number", True)
32
+ tier_iii_f1 = ColumnContent("tier_iii_f1", "number", True)
33
+ # Specific benchmark metrics
34
+ presence = ColumnContent("presence", "number", True)
35
+ identification = ColumnContent("identification", "number", True)
36
+ start_time = ColumnContent("start_time", "number", True)
37
+ end_time = ColumnContent("end_time", "number", True)
38
+ magnitude = ColumnContent("magnitude", "number", True)
39
+ categorization = ColumnContent("categorization", "number", True)
40
+ correlation = ColumnContent("correlation", "number", True)
41
+ indicator = ColumnContent("indicator", "number", True)
42
+
43
+
44
+ # Overall + per-tier leaderboard columns
45
+ @dataclass(frozen=True)
46
+ class OverallTierColumn:
47
+ model = ColumnContent("model", "markdown", True, never_hidden=True)
48
+ model_type = ColumnContent("model_type", "str", True)
49
+ accuracy = ColumnContent("accuracy", "number", True)
50
+ tier_i_accuracy = ColumnContent("tier_i_accuracy", "number", True)
51
+ tier_ii_accuracy = ColumnContent("tier_ii_accuracy", "number", True)
52
+ tier_iii_accuracy = ColumnContent("tier_iii_accuracy", "number", True)
53
+ overall_f1 = ColumnContent("overall_f1", "number", True)
54
+ tier_i_f1 = ColumnContent("tier_i_f1", "number", True)
55
+ tier_ii_f1 = ColumnContent("tier_ii_f1", "number", True)
56
+ tier_iii_f1 = ColumnContent("tier_iii_f1", "number", True)
57
+
58
+
59
+ # Per-category F1 leaderboard columns
60
+ @dataclass(frozen=True)
61
+ class CategoryF1Column:
62
+ model = ColumnContent("model", "markdown", True, never_hidden=True)
63
+ model_type = ColumnContent("model_type", "str", True)
64
+ overall_f1 = ColumnContent("overall_f1", "number", True)
65
+ presence = ColumnContent("presence", "number", True)
66
+ identification = ColumnContent("identification", "number", True)
67
+ start_time = ColumnContent("start_time", "number", True)
68
+ end_time = ColumnContent("end_time", "number", True)
69
+ magnitude = ColumnContent("magnitude", "number", True)
70
+ categorization = ColumnContent("categorization", "number", True)
71
+ correlation = ColumnContent("correlation", "number", True)
72
+ indicator = ColumnContent("indicator", "number", True)
73
+
74
+
75
+ # Per-category accuracy leaderboard columns
76
+ @dataclass(frozen=True)
77
+ class CategoryAccuracyColumn:
78
+ model = ColumnContent("model", "markdown", True, never_hidden=True)
79
+ model_type = ColumnContent("model_type", "str", True)
80
+ overall_accuracy = ColumnContent("overall_accuracy", "number", True)
81
+ presence = ColumnContent("presence", "number", True)
82
+ identification = ColumnContent("identification", "number", True)
83
+ start_time = ColumnContent("start_time", "number", True)
84
+ end_time = ColumnContent("end_time", "number", True)
85
+ magnitude = ColumnContent("magnitude", "number", True)
86
+ categorization = ColumnContent("categorization", "number", True)
87
+ correlation = ColumnContent("correlation", "number", True)
88
+ indicator = ColumnContent("indicator", "number", True)
89
+
90
+
91
+ # For the queue columns in the submission tab
92
  @dataclass(frozen=True)
93
  class EvalQueueColumn: # Queue column
94
  model = ColumnContent("model", "markdown", True)
 
99
  status = ColumnContent("status", "str", True)
100
 
101
 
102
+ # All the model information that we might need
103
  @dataclass
104
  class ModelDetails:
105
  name: str
 
108
 
109
 
110
  class ModelType(Enum):
111
+ LLM = ModelDetails(name="LLM", symbol="🟢")
112
+ VLM = ModelDetails(name="VLM", symbol="🔶")
113
+ TSFM = ModelDetails(name="Post-trained TSFM", symbol="⭕")
 
114
  Unknown = ModelDetails(name="", symbol="?")
115
 
116
  def to_str(self, separator=" "):
 
118
 
119
  @staticmethod
120
  def from_str(type):
121
+ if "VLM" in type or "🔶" in type:
122
+ return ModelType.VLM
123
+ if "LLM" in type or "🟢" in type:
124
+ return ModelType.LLM
125
+ if "TSFM" in type or "" in type:
 
 
126
  return ModelType.TSFM
127
  return ModelType.Unknown
128
 
 
154
 
155
  # Define the benchmark columns for ARFBench
156
  BENCHMARK_COLS = [
157
+ "model",
158
+ "model_type",
159
+ "overall_f1",
160
+ "tier_i_f1",
161
+ "tier_ii_f1",
162
+ "tier_iii_f1",
163
  "presence",
164
  "identification",
165
  "start_time",
 
169
  "correlation",
170
  "indicator",
171
  ]
172
+
173
+
174
+ # New leaderboard datasets
175
+ OVERALL_TIER_COLS = [c.name for c in fields(OverallTierColumn) if not c.hidden]
176
+ CATEGORY_F1_COLS = [c.name for c in fields(CategoryF1Column) if not c.hidden]
177
+ CATEGORY_ACCURACY_COLS = [c.name for c in fields(CategoryAccuracyColumn) if not c.hidden]
src/populate.py CHANGED
@@ -1,51 +1,48 @@
1
- import os
2
  import pandas as pd
3
 
4
  from src.display.formatting import make_clickable_model
5
 
6
 
7
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
8
- """Creates a dataframe from the static CSV file"""
9
- # Read the static CSV file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  df = pd.read_csv(results_path)
 
11
 
12
- # Clean up column names to match our schema
13
- df.columns = df.columns.str.replace("pass@1", "pass_at_1")
14
- df.columns = df.columns.str.replace("pass@5", "pass_at_5")
15
- df.columns = df.columns.str.replace("Start Time", "start_time")
16
- df.columns = df.columns.str.replace("End Time", "end_time")
17
- df.columns = df.columns.str.lower()
18
-
19
- # Make model names clickable
20
- df["model"] = df["model"].apply(make_clickable_model)
21
-
22
- # Sort by pass@1 performance (descending)
23
- df = df.sort_values(by=["pass_at_1"], ascending=False)
24
-
25
- # Round numeric columns to 2 decimal places
26
- numeric_cols = [
27
- "pass_at_1",
28
- "pass_at_5",
29
- "presence",
30
- "identification",
31
- "start_time",
32
- "end_time",
33
- "magnitude",
34
- "categorization",
35
- "correlation",
36
- "indicator",
37
- ]
38
-
39
- for col in numeric_cols:
40
- if col in df.columns:
41
- df[col] = df[col].round(2)
42
-
43
- # Handle missing values - replace with 0 or appropriate value
44
- df = df.fillna(0)
45
-
46
- # Select only the columns we need
47
- available_cols = [col for col in cols if col in df.columns]
48
- df = df[available_cols]
49
 
50
  return df
51
 
 
 
1
  import pandas as pd
2
 
3
  from src.display.formatting import make_clickable_model
4
 
5
 
6
+ def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
7
+ rename_map = {
8
+ "model type": "model_type",
9
+ "overall f1": "overall_f1",
10
+ "overall accuracy": "overall_accuracy",
11
+ "accuracy": "accuracy",
12
+ "tier i": "tier_i_f1",
13
+ "tier ii": "tier_ii_f1",
14
+ "tier iii": "tier_iii_f1",
15
+ "tier i accuracy": "tier_i_accuracy",
16
+ "tier ii accuracy": "tier_ii_accuracy",
17
+ "tier iii accuracy": "tier_iii_accuracy",
18
+ "start time": "start_time",
19
+ "end time": "end_time",
20
+ }
21
+
22
+ normalized = {}
23
+ for col in df.columns:
24
+ cleaned = col.strip().lower()
25
+ normalized[col] = rename_map.get(cleaned, cleaned.replace(" ", "_"))
26
+
27
+ return df.rename(columns=normalized)
28
+
29
+
30
def get_leaderboard_df(
    results_path: str,
    _requests_path: str,
    _cols: list,
    _benchmark_cols: list,
    sort_by: str = "overall_f1",
) -> pd.DataFrame:
    """Load one static leaderboard CSV into a display-ready DataFrame.

    ``_requests_path``, ``_cols`` and ``_benchmark_cols`` are kept only for
    signature compatibility with existing callers; all data comes from the
    CSV at *results_path*.  Headers are normalized to snake_case, model
    names become hub links where possible, and rows are sorted descending
    by *sort_by* when that column exists.
    """
    leaderboard = _normalize_columns(pd.read_csv(results_path))

    if "model" in leaderboard.columns:
        leaderboard["model"] = leaderboard["model"].map(make_clickable_model)

    if sort_by in leaderboard.columns:
        leaderboard = leaderboard.sort_values(by=[sort_by], ascending=False)

    return leaderboard
48