update leaderboard results
Browse files- app.py +60 -16
- results/ARFBench_leaderboard.csv +22 -14
- results/ARFBench_leaderboard_category_accuracy.csv +22 -0
- results/ARFBench_leaderboard_category_f1.csv +22 -0
- src/about.py +1 -1
- src/display/formatting.py +21 -4
- src/display/utils.py +92 -36
- src/populate.py +38 -41
app.py
CHANGED
|
@@ -10,14 +10,15 @@ from src.about import (
|
|
| 10 |
)
|
| 11 |
from src.display.css_html_js import custom_css
|
| 12 |
from src.display.utils import (
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
| 16 |
fields,
|
| 17 |
)
|
| 18 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 19 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 20 |
-
from src.submission.submit import add_new_eval
|
| 21 |
from src.populate import get_leaderboard_df
|
| 22 |
|
| 23 |
|
|
@@ -25,31 +26,51 @@ def restart_space():
|
|
| 25 |
API.restart_space(repo_id=REPO_ID)
|
| 26 |
|
| 27 |
|
| 28 |
-
|
| 29 |
-
EVAL_RESULTS_PATH + "/ARFBench_leaderboard.csv",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
|
|
|
| 34 |
if dataframe is None or dataframe.empty:
|
| 35 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 36 |
return Leaderboard(
|
| 37 |
value=dataframe,
|
| 38 |
-
datatype=[c.type for c in fields(
|
| 39 |
select_columns=SelectColumns(
|
| 40 |
-
default_selection=[c.name for c in fields(
|
| 41 |
-
cant_deselect=[c.name for c in fields(
|
| 42 |
label="Select Columns to Display:",
|
| 43 |
),
|
| 44 |
-
search_columns=[
|
| 45 |
-
hide_columns=[c.name for c in fields(
|
| 46 |
filter_columns=[
|
| 47 |
ColumnFilter(
|
| 48 |
-
|
| 49 |
type="slider",
|
| 50 |
min=0,
|
| 51 |
max=100,
|
| 52 |
-
label=
|
| 53 |
),
|
| 54 |
],
|
| 55 |
bool_checkboxgroup_label="Hide models",
|
|
@@ -64,7 +85,30 @@ with demo:
|
|
| 64 |
|
| 65 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 66 |
with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
|
| 70 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
| 10 |
)
|
| 11 |
from src.display.css_html_js import custom_css
|
| 12 |
from src.display.utils import (
|
| 13 |
+
CATEGORY_ACCURACY_COLS,
|
| 14 |
+
CATEGORY_F1_COLS,
|
| 15 |
+
OVERALL_TIER_COLS,
|
| 16 |
+
CategoryAccuracyColumn,
|
| 17 |
+
CategoryF1Column,
|
| 18 |
+
OverallTierColumn,
|
| 19 |
fields,
|
| 20 |
)
|
| 21 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
|
|
|
|
|
|
| 22 |
from src.populate import get_leaderboard_df
|
| 23 |
|
| 24 |
|
|
|
|
| 26 |
API.restart_space(repo_id=REPO_ID)
|
| 27 |
|
| 28 |
|
| 29 |
+
def _load_leaderboard(csv_suffix, columns, sort_by):
    # Every leaderboard view is loaded the same way: a CSV under
    # EVAL_RESULTS_PATH, the shared requests path, and one column list that is
    # passed for both the column and benchmark-column arguments.
    return get_leaderboard_df(
        EVAL_RESULTS_PATH + csv_suffix,
        EVAL_REQUESTS_PATH,
        columns,
        columns,
        sort_by=sort_by,
    )


# Overall + per-tier view, ranked by overall F1.
OVERALL_TIER_LEADERBOARD_DF = _load_leaderboard(
    "/ARFBench_leaderboard.csv", OVERALL_TIER_COLS, "overall_f1"
)

# Per-category F1 view, ranked by overall F1.
CATEGORY_F1_LEADERBOARD_DF = _load_leaderboard(
    "/ARFBench_leaderboard_category_f1.csv", CATEGORY_F1_COLS, "overall_f1"
)

# Per-category accuracy view, ranked by overall accuracy.
CATEGORY_ACCURACY_LEADERBOARD_DF = _load_leaderboard(
    "/ARFBench_leaderboard_category_accuracy.csv", CATEGORY_ACCURACY_COLS, "overall_accuracy"
)
|
| 52 |
|
| 53 |
+
|
| 54 |
+
def init_custom_leaderboard(dataframe, column_class, filter_column_name, filter_label):
|
| 55 |
if dataframe is None or dataframe.empty:
|
| 56 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 57 |
return Leaderboard(
|
| 58 |
value=dataframe,
|
| 59 |
+
datatype=[c.type for c in fields(column_class)],
|
| 60 |
select_columns=SelectColumns(
|
| 61 |
+
default_selection=[c.name for c in fields(column_class) if c.displayed_by_default],
|
| 62 |
+
cant_deselect=[c.name for c in fields(column_class) if c.never_hidden],
|
| 63 |
label="Select Columns to Display:",
|
| 64 |
),
|
| 65 |
+
search_columns=[column_class.model.name],
|
| 66 |
+
hide_columns=[c.name for c in fields(column_class) if c.hidden],
|
| 67 |
filter_columns=[
|
| 68 |
ColumnFilter(
|
| 69 |
+
filter_column_name,
|
| 70 |
type="slider",
|
| 71 |
min=0,
|
| 72 |
max=100,
|
| 73 |
+
label=filter_label,
|
| 74 |
),
|
| 75 |
],
|
| 76 |
bool_checkboxgroup_label="Hide models",
|
|
|
|
| 85 |
|
| 86 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 87 |
with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
|
| 88 |
+
with gr.Tabs(selected=0):
|
| 89 |
+
with gr.TabItem("Overall + Tier (Default)", id=0):
|
| 90 |
+
leaderboard_overall_tier = init_custom_leaderboard(
|
| 91 |
+
OVERALL_TIER_LEADERBOARD_DF,
|
| 92 |
+
OverallTierColumn,
|
| 93 |
+
OverallTierColumn.overall_f1.name,
|
| 94 |
+
"Overall F1 score",
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
with gr.TabItem("Per-Category F1", id=1):
|
| 98 |
+
leaderboard_category_f1 = init_custom_leaderboard(
|
| 99 |
+
CATEGORY_F1_LEADERBOARD_DF,
|
| 100 |
+
CategoryF1Column,
|
| 101 |
+
CategoryF1Column.overall_f1.name,
|
| 102 |
+
"Overall F1 score",
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
with gr.TabItem("Per-Category Accuracy", id=2):
|
| 106 |
+
leaderboard_category_accuracy = init_custom_leaderboard(
|
| 107 |
+
CATEGORY_ACCURACY_LEADERBOARD_DF,
|
| 108 |
+
CategoryAccuracyColumn,
|
| 109 |
+
CategoryAccuracyColumn.overall_accuracy.name,
|
| 110 |
+
"Overall Accuracy score",
|
| 111 |
+
)
|
| 112 |
|
| 113 |
with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
|
| 114 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
results/ARFBench_leaderboard.csv
CHANGED
|
@@ -1,14 +1,22 @@
|
|
| 1 |
-
Model,
|
| 2 |
-
Random Choice,
|
| 3 |
-
Frequent Choice,
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Model Type,Accuracy,Tier I Accuracy,Tier II Accuracy,Tier III Accuracy,Overall F1,Tier I F1,Tier II F1,Tier III F1
|
| 2 |
+
Random Choice,Baseline,24.5,50.0,20.0,20.0,22.5,45.6,20.5,16.8
|
| 3 |
+
Per-category Frequent Choice,Baseline,45.1,84.7,30.1,45.6,17.3,45.9,12.3,12.5
|
| 4 |
+
Non-domain Experts (n=2),Baseline,69.7,80.4,63.2,72.0,60.7,68.0,59.9,59.0
|
| 5 |
+
Domain Experts (n=2),Baseline,72.7,89.3,67.7,71.4,64.6,76.1,64.5,60.9
|
| 6 |
+
Model-Expert Oracle,Baseline,87.2,96.4,80.3,90.5,82.8,89.0,77.1,86.3
|
| 7 |
+
Qwen3 32B,LLM,47.9,80.9,35.1,48.6,36.1,55.7,31.5,33.8
|
| 8 |
+
GPT-5 (text),LLM,56.4,82.6,45.2,57.9,43.8,66.1,39.6,40.3
|
| 9 |
+
Qwen3-VL 8B,VLM,45.3,80.2,40.8,37.8,34.7,63.5,36.1,23.6
|
| 10 |
+
Claude Sonnet 4.5,VLM,47.2,83.8,43.5,38.4,37.9,63.2,40.6,26.9
|
| 11 |
+
GPT-4o,VLM,47.2,79.3,49.0,34.8,42.4,64.2,43.8,33.8
|
| 12 |
+
GPT-4.1,VLM,47.9,80.2,50.3,34.8,44.0,65.1,48.0,33.1
|
| 13 |
+
Qwen3-VL 32B,VLM,52.8,80.2,46.7,49.2,45.1,65.1,41.9,41.3
|
| 14 |
+
Claude Opus 4.6,VLM,54.8,88.3,52.3,45.9,46.7,65.8,49.1,38.2
|
| 15 |
+
Gemini 3 Pro,VLM,58.1,82.9,51.0,56.5,49.6,67.8,49.7,43.4
|
| 16 |
+
GPT-5.4,VLM,61.3,81.1,54.2,61.3,51.4,62.6,50.4,48.4
|
| 17 |
+
GPT-5,VLM,62.7,82.0,55.9,62.5,51.9,66.9,51.2,47.5
|
| 18 |
+
OpenTSLM 1B (TS-LLM),Post-trained TSFM,0.8,0.0,2.0,0.0,1.2,0.0,3.0,0.0
|
| 19 |
+
ChatTS 8B (TS-LLM),Post-trained TSFM,31.1,60.4,26.5,25.5,22.1,48.1,20.0,15.4
|
| 20 |
+
Toto-Qwen3 32B (TSFM-LLM),Post-trained TSFM,48.8,82.9,47.4,38.7,33.9,60.0,43.6,16.4
|
| 21 |
+
Qwen3-VL 32B (post-trained),Post-trained TSFM,56.9,84.7,50.3,53.8,46.6,69.8,44.9,40.5
|
| 22 |
+
Toto-VLM 32B (TSFM-VLM),Post-trained TSFM,63.9,84.7,55.6,64.6,48.9,66.3,48.4,43.5
|
results/ARFBench_leaderboard_category_accuracy.csv
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Model Type,Overall Accuracy,Presence,Identification,Start Time,End Time,Magnitude,Categorization,Correlation,Indicator
|
| 2 |
+
Random Choice,Baseline,24.5,50.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
|
| 3 |
+
Per-category Frequent Choice,Baseline,45.1,84.7,36.8,35.7,34.4,17.1,32.7,42.9,48.5
|
| 4 |
+
Non-domain Experts (n=2),Baseline,69.7,80.4,66.7,64.3,68.8,60.5,61.5,72.1,72.0
|
| 5 |
+
Domain Experts (n=2),Baseline,72.7,89.3,77.8,67.9,75.0,60.5,72.4,74.4,68.3
|
| 6 |
+
Model-Expert Oracle,Baseline,87.2,96.4,77.8,78.6,100.0,68.4,84.6,95.4,85.4
|
| 7 |
+
Qwen3 32B (text),LLM,47.9,80.9,28.9,27.3,35.5,37.3,39.8,50.9,46.3
|
| 8 |
+
GPT-5 (text),LLM,56.4,82.6,47.4,29.6,38.7,51.4,50.0,56.9,59.0
|
| 9 |
+
Qwen3-VL 8B,VLM,45.3,80.2,26.3,25.0,31.3,57.9,45.2,57.1,17.8
|
| 10 |
+
Claude Sonnet 4.5,VLM,47.2,83.8,18.4,30.4,37.5,53.9,53.8,58.8,17.2
|
| 11 |
+
GPT-4o,VLM,47.2,79.3,39.5,35.7,43.8,61.8,51.9,45.3,23.9
|
| 12 |
+
GPT-4.1,VLM,47.9,80.2,28.9,33.9,40.6,68.4,56.7,45.9,23.3
|
| 13 |
+
Qwen3-VL 32B,VLM,52.8,80.2,23.7,33.9,56.3,59.2,50.0,61.8,36.2
|
| 14 |
+
Claude Opus 4.6,VLM,54.8,88.3,31.6,37.5,53.1,57.9,63.5,65.9,25.2
|
| 15 |
+
Gemini 3 Pro,VLM,58.1,82.9,28.9,44.6,62.5,56.7,54.8,71.2,41.1
|
| 16 |
+
GPT-5.4,VLM,61.3,81.1,31.6,63.6,65.6,57.9,56.7,61.8,60.7
|
| 17 |
+
GPT-5,VLM,62.7,82.0,31.6,44.6,68.8,65.8,59.6,63.5,61.3
|
| 18 |
+
OpenTSLM 1B (TS-LLM),Post-trained TSFM,0.8,0.0,0.0,3.6,0.0,5.3,0.0,0.0,0.0
|
| 19 |
+
ChatTS 8B (TS-LLM),Post-trained TSFM,31.1,59.5,15.8,16.1,15.6,28.9,20.2,40.0,14.7
|
| 20 |
+
Toto-Qwen3 32B (TSFM-LLM),Post-trained TSFM,48.8,82.9,10.5,35.7,34.4,47.4,71.2,41.8,35.6
|
| 21 |
+
Qwen3-VL 32B (post-trained),Post-trained TSFM,56.9,84.7,36.8,41.1,43.8,63.2,52.9,67.6,39.3
|
| 22 |
+
Toto-VLM 32B (TSFM-VLM),Post-trained TSFM,63.9,84.7,47.4,26.8,59.4,64.5,66.3,68.8,60.1
|
results/ARFBench_leaderboard_category_f1.csv
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Model Type,Overall F1,Presence,Identification,Start Time,End Time,Magnitude,Categorization,Correlation,Indicator
|
| 2 |
+
Random Choice,Baseline,22.5,45.6,21.2,18.9,18.2,20.4,21.7,15.8,17.8
|
| 3 |
+
Per-category Frequent Choice,Baseline,17.3,45.9,10.8,16.3,14.1,6.0,14.6,12.0,13.1
|
| 4 |
+
Non-domain Experts (n=2),Baseline,61.3,68.0,79.0,67.4,67.2,40.3,61.2,58.4,62.4
|
| 5 |
+
Domain Experts (n=2),Baseline,64.6,76.1,77.5,74.2,72.6,51.8,67.3,64.1,57.6
|
| 6 |
+
Model-Expert Oracle,Baseline,82.8,89.0,68.3,83.4,100.0,67.0,75.6,94.4,77.8
|
| 7 |
+
Qwen3 32B,LLM,36.1,55.7,28.4,26.6,26.9,31.4,36.8,32.3,35.4
|
| 8 |
+
GPT-5 (text),LLM,43.8,66.1,38.1,27.9,27.0,44.8,47.6,38.0,42.4
|
| 9 |
+
Qwen3-VL 8B,VLM,34.7,63.5,28.6,21.8,23.5,47.0,42.8,33.1,13.8
|
| 10 |
+
Claude Sonnet 4.5,VLM,37.9,63.2,16.8,33.2,31.3,49.3,49.8,33.8,19.8
|
| 11 |
+
GPT-4o,VLM,42.4,64.2,34.6,30.3,36.1,51.8,50.8,40.1,27.2
|
| 12 |
+
GPT-4.1,VLM,44.0,65.1,29.2,33.5,32.7,63.7,55.9,42.9,23.3
|
| 13 |
+
Qwen3-VL 32B,VLM,45.1,65.1,25.0,30.8,46.7,46.9,49.0,47.5,34.7
|
| 14 |
+
Claude Opus 4.6,VLM,46.7,65.8,34.3,36.1,45.1,53.8,59.2,51.6,24.1
|
| 15 |
+
Gemini 3 Pro,VLM,49.6,67.8,38.6,43.3,57.1,50.3,54.5,57.0,29.2
|
| 16 |
+
GPT-5.4,VLM,51.4,62.6,29.6,53.3,55.1,51.7,54.1,47.7,49.1
|
| 17 |
+
GPT-5,VLM,51.9,66.8,32.8,44.2,47.8,59.1,57.0,49.0,45.9
|
| 18 |
+
OpenTSLM 1B (TS-LLM),Post-trained TSFM,1.2,0.0,8.2,2.7,0.0,6.0,0.0,0.0,0.0
|
| 19 |
+
ChatTS 8B (TS-LLM),Post-trained TSFM,22.1,48.1,22.2,15.0,14.4,27.9,17.9,21.4,9.2
|
| 20 |
+
Toto-Qwen3 32B (TSFM-LLM),Post-trained TSFM,33.9,59.9,17.5,41.3,23.0,35.9,66.2,18.6,14.1
|
| 21 |
+
Qwen3-VL 32B (post-trained),Post-trained TSFM,46.6,69.7,40.5,37.2,36.7,48.9,50.3,46.8,33.9
|
| 22 |
+
Toto-VLM 32B (TSFM-VLM),Post-trained TSFM,48.9,66.3,46.9,23.0,48.8,54.1,58.4,44.2,42.7
|
src/about.py
CHANGED
|
@@ -27,7 +27,7 @@ TITLE = """<h1 align="center" id="space-title">ARFBench Multimodal Time Series R
|
|
| 27 |
# What does your leaderboard evaluate?
|
| 28 |
INTRODUCTION_TEXT = """
|
| 29 |
**ARF**Bench (**A**nomaly **R**easoning **F**ramework Benchmark) is a
|
| 30 |
-
multimodal time-series reasoning benchmark consisting of
|
| 31 |
(QA) pairs composed from real-world incident data collected at Datadog,
|
| 32 |
a leading observability platform.
|
| 33 |
|
|
|
|
| 27 |
# What does your leaderboard evaluate?
|
| 28 |
INTRODUCTION_TEXT = """
|
| 29 |
**ARF**Bench (**A**nomaly **R**easoning **F**ramework Benchmark) is a
|
| 30 |
+
multimodal time-series reasoning benchmark consisting of 750 question-answer
|
| 31 |
(QA) pairs composed from real-world incident data collected at Datadog,
|
| 32 |
a leading observability platform.
|
| 33 |
|
src/display/formatting.py
CHANGED
|
@@ -1,24 +1,41 @@
|
|
| 1 |
def model_hyperlink(link, model_name):
|
| 2 |
if model_name == "":
|
| 3 |
return model_name
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def make_clickable_model(model_name):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
link = f"https://huggingface.co/{model_name}"
|
| 9 |
return model_hyperlink(link, model_name)
|
| 10 |
|
| 11 |
|
| 12 |
def styled_error(error):
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def styled_warning(warn):
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def styled_message(message):
|
| 21 |
-
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def has_no_nan_values(df, columns):
|
|
|
|
| 1 |
def model_hyperlink(link, model_name):
    """Return an HTML anchor that shows ``model_name`` and opens ``link`` in a new tab.

    An empty-string ``model_name`` is returned unchanged so that blank cells
    do not turn into empty links.
    """
    if model_name != "":
        # Dotted underline in the theme's link colour; the pieces are joined
        # exactly as written (note: no space between the last two rules).
        style = "".join(
            (
                "color: var(--link-text-color); ",
                "text-decoration: underline;",
                "text-decoration-style: dotted;",
            )
        )
        return f'<a target="_blank" href="{link}" style="{style}">{model_name}</a>'
    return model_name
|
| 10 |
|
| 11 |
|
| 12 |
def make_clickable_model(model_name):
    """Turn a Hugging Face repo id into a link; leave any other label as text.

    The value is coerced to ``str`` and stripped of surrounding whitespace.
    It is linked only when it contains a ``/`` and no space; otherwise the
    stripped text is returned as-is.
    """
    label = (model_name if isinstance(model_name, str) else str(model_name)).strip()

    # Display names such as "GPT-5 (text)" have no slash or contain spaces,
    # so they must not be rewritten into huggingface.co URLs.
    looks_like_repo_id = "/" in label and " " not in label
    if looks_like_repo_id:
        return model_hyperlink(f"https://huggingface.co/{label}", label)
    return label
|
| 24 |
|
| 25 |
|
| 26 |
def styled_error(error):
    """Wrap ``error`` in a centered, red, 20px HTML paragraph."""
    css = "color: red; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, error)
|
| 29 |
|
| 30 |
|
| 31 |
def styled_warning(warn):
    """Wrap ``warn`` in a centered, orange, 20px HTML paragraph."""
    css = "color: orange; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, warn)
|
| 34 |
|
| 35 |
|
| 36 |
def styled_message(message):
    """Wrap ``message`` in a centered, green, 20px HTML paragraph."""
    css = "color: green; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(css, message)
|
| 39 |
|
| 40 |
|
| 41 |
def has_no_nan_values(df, columns):
|
src/display/utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from dataclasses import dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
|
|
@@ -19,27 +19,76 @@ class ColumnContent:
|
|
| 19 |
|
| 20 |
|
| 21 |
# ARFBench Leaderboard columns
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
@dataclass(frozen=True)
|
| 44 |
class EvalQueueColumn: # Queue column
|
| 45 |
model = ColumnContent("model", "markdown", True)
|
|
@@ -50,7 +99,7 @@ class EvalQueueColumn: # Queue column
|
|
| 50 |
status = ColumnContent("status", "str", True)
|
| 51 |
|
| 52 |
|
| 53 |
-
#
|
| 54 |
@dataclass
|
| 55 |
class ModelDetails:
|
| 56 |
name: str
|
|
@@ -59,10 +108,9 @@ class ModelDetails:
|
|
| 59 |
|
| 60 |
|
| 61 |
class ModelType(Enum):
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
TSFM = ModelDetails(name="
|
| 65 |
-
R = ModelDetails(name="reasoning", symbol="🟦")
|
| 66 |
Unknown = ModelDetails(name="", symbol="?")
|
| 67 |
|
| 68 |
def to_str(self, separator=" "):
|
|
@@ -70,13 +118,11 @@ class ModelType(Enum):
|
|
| 70 |
|
| 71 |
@staticmethod
|
| 72 |
def from_str(type):
|
| 73 |
-
if "
|
| 74 |
-
return ModelType.
|
| 75 |
-
if "
|
| 76 |
-
return ModelType.
|
| 77 |
-
if "
|
| 78 |
-
return ModelType.R
|
| 79 |
-
if "time-series FM" in type or "⭕" in type:
|
| 80 |
return ModelType.TSFM
|
| 81 |
return ModelType.Unknown
|
| 82 |
|
|
@@ -108,8 +154,12 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
|
| 108 |
|
| 109 |
# Define the benchmark columns for ARFBench
|
| 110 |
BENCHMARK_COLS = [
|
| 111 |
-
"
|
| 112 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
"presence",
|
| 114 |
"identification",
|
| 115 |
"start_time",
|
|
@@ -119,3 +169,9 @@ BENCHMARK_COLS = [
|
|
| 119 |
"correlation",
|
| 120 |
"indicator",
|
| 121 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
# ARFBench Leaderboard columns
|
| 22 |
+
@dataclass(frozen=True)
class AutoEvalColumn:
    # Combined column set: model identity, overall/per-tier F1, and the eight
    # per-category metrics.
    # None of the attributes is annotated, so @dataclass creates no instance
    # fields; the ColumnContent objects are plain class attributes.
    # Positional args look like (name, type, displayed_by_default) — TODO
    # confirm against the ColumnContent definition.

    # Model column (always displayed)
    model = ColumnContent("model", "markdown", True, never_hidden=True)
    # Model type column
    model_type = ColumnContent("model_type", "str", True)
    # Performance metrics
    overall_f1 = ColumnContent("overall_f1", "number", True)
    tier_i_f1 = ColumnContent("tier_i_f1", "number", True)
    tier_ii_f1 = ColumnContent("tier_ii_f1", "number", True)
    tier_iii_f1 = ColumnContent("tier_iii_f1", "number", True)
    # Specific benchmark metrics
    presence = ColumnContent("presence", "number", True)
    identification = ColumnContent("identification", "number", True)
    start_time = ColumnContent("start_time", "number", True)
    end_time = ColumnContent("end_time", "number", True)
    magnitude = ColumnContent("magnitude", "number", True)
    categorization = ColumnContent("categorization", "number", True)
    correlation = ColumnContent("correlation", "number", True)
    indicator = ColumnContent("indicator", "number", True)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Overall + per-tier leaderboard columns
|
| 45 |
+
@dataclass(frozen=True)
class OverallTierColumn:
    # Columns of the "Overall + Tier" leaderboard view. The names are the
    # snake_case form of the headers in results/ARFBench_leaderboard.csv
    # (e.g. "Tier I Accuracy" -> "tier_i_accuracy"), and the attribute order
    # follows the CSV column order.
    # The attributes are unannotated class attributes, not dataclass fields.

    # Identity columns; the model column is flagged never_hidden.
    model = ColumnContent("model", "markdown", True, never_hidden=True)
    model_type = ColumnContent("model_type", "str", True)
    # Accuracy: overall, then per difficulty tier.
    accuracy = ColumnContent("accuracy", "number", True)
    tier_i_accuracy = ColumnContent("tier_i_accuracy", "number", True)
    tier_ii_accuracy = ColumnContent("tier_ii_accuracy", "number", True)
    tier_iii_accuracy = ColumnContent("tier_iii_accuracy", "number", True)
    # F1: overall, then per difficulty tier.
    overall_f1 = ColumnContent("overall_f1", "number", True)
    tier_i_f1 = ColumnContent("tier_i_f1", "number", True)
    tier_ii_f1 = ColumnContent("tier_ii_f1", "number", True)
    tier_iii_f1 = ColumnContent("tier_iii_f1", "number", True)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# Per-category F1 leaderboard columns
|
| 60 |
+
@dataclass(frozen=True)
class CategoryF1Column:
    # Columns of the "Per-Category F1" leaderboard view. The names are the
    # snake_case form of the headers in
    # results/ARFBench_leaderboard_category_f1.csv, in the same order.
    # The attributes are unannotated class attributes, not dataclass fields.

    # Identity columns; the model column is flagged never_hidden.
    model = ColumnContent("model", "markdown", True, never_hidden=True)
    model_type = ColumnContent("model_type", "str", True)
    # Aggregate F1.
    overall_f1 = ColumnContent("overall_f1", "number", True)
    # One F1 column per question category.
    presence = ColumnContent("presence", "number", True)
    identification = ColumnContent("identification", "number", True)
    start_time = ColumnContent("start_time", "number", True)
    end_time = ColumnContent("end_time", "number", True)
    magnitude = ColumnContent("magnitude", "number", True)
    categorization = ColumnContent("categorization", "number", True)
    correlation = ColumnContent("correlation", "number", True)
    indicator = ColumnContent("indicator", "number", True)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Per-category accuracy leaderboard columns
|
| 76 |
+
@dataclass(frozen=True)
class CategoryAccuracyColumn:
    # Columns of the "Per-Category Accuracy" leaderboard view. The names are
    # the snake_case form of the headers in
    # results/ARFBench_leaderboard_category_accuracy.csv, in the same order.
    # The attributes are unannotated class attributes, not dataclass fields.

    # Identity columns; the model column is flagged never_hidden.
    model = ColumnContent("model", "markdown", True, never_hidden=True)
    model_type = ColumnContent("model_type", "str", True)
    # Aggregate accuracy.
    overall_accuracy = ColumnContent("overall_accuracy", "number", True)
    # One accuracy column per question category.
    presence = ColumnContent("presence", "number", True)
    identification = ColumnContent("identification", "number", True)
    start_time = ColumnContent("start_time", "number", True)
    end_time = ColumnContent("end_time", "number", True)
    magnitude = ColumnContent("magnitude", "number", True)
    categorization = ColumnContent("categorization", "number", True)
    correlation = ColumnContent("correlation", "number", True)
    indicator = ColumnContent("indicator", "number", True)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# For the queue columns in the submission tab
|
| 92 |
@dataclass(frozen=True)
|
| 93 |
class EvalQueueColumn: # Queue column
|
| 94 |
model = ColumnContent("model", "markdown", True)
|
|
|
|
| 99 |
status = ColumnContent("status", "str", True)
|
| 100 |
|
| 101 |
|
| 102 |
+
# All the model information that we might need
|
| 103 |
@dataclass
|
| 104 |
class ModelDetails:
|
| 105 |
name: str
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
class ModelType(Enum):
|
| 111 |
+
LLM = ModelDetails(name="LLM", symbol="🟢")
|
| 112 |
+
VLM = ModelDetails(name="VLM", symbol="🔶")
|
| 113 |
+
TSFM = ModelDetails(name="Post-trained TSFM", symbol="⭕")
|
|
|
|
| 114 |
Unknown = ModelDetails(name="", symbol="?")
|
| 115 |
|
| 116 |
def to_str(self, separator=" "):
|
|
|
|
| 118 |
|
| 119 |
@staticmethod
|
| 120 |
def from_str(type):
|
| 121 |
+
if "VLM" in type or "🔶" in type:
|
| 122 |
+
return ModelType.VLM
|
| 123 |
+
if "LLM" in type or "🟢" in type:
|
| 124 |
+
return ModelType.LLM
|
| 125 |
+
if "TSFM" in type or "⭕" in type:
|
|
|
|
|
|
|
| 126 |
return ModelType.TSFM
|
| 127 |
return ModelType.Unknown
|
| 128 |
|
|
|
|
| 154 |
|
| 155 |
# Define the benchmark columns for ARFBench
|
| 156 |
BENCHMARK_COLS = [
|
| 157 |
+
"model",
|
| 158 |
+
"model_type",
|
| 159 |
+
"overall_f1",
|
| 160 |
+
"tier_i_f1",
|
| 161 |
+
"tier_ii_f1",
|
| 162 |
+
"tier_iii_f1",
|
| 163 |
"presence",
|
| 164 |
"identification",
|
| 165 |
"start_time",
|
|
|
|
| 169 |
"correlation",
|
| 170 |
"indicator",
|
| 171 |
]
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# New leaderboard datasets
# One list per leaderboard view, holding the `name` of every column in the
# corresponding column class whose `hidden` flag is false.
OVERALL_TIER_COLS = [c.name for c in fields(OverallTierColumn) if not c.hidden]
CATEGORY_F1_COLS = [c.name for c in fields(CategoryF1Column) if not c.hidden]
CATEGORY_ACCURACY_COLS = [c.name for c in fields(CategoryAccuracyColumn) if not c.hidden]
|
src/populate.py
CHANGED
|
@@ -1,51 +1,48 @@
|
|
| 1 |
-
import os
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
from src.display.formatting import make_clickable_model
|
| 5 |
|
| 6 |
|
| 7 |
-
def
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
df = pd.read_csv(results_path)
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
df.columns = df.columns.str.lower()
|
| 18 |
-
|
| 19 |
-
# Make model names clickable
|
| 20 |
-
df["model"] = df["model"].apply(make_clickable_model)
|
| 21 |
-
|
| 22 |
-
# Sort by pass@1 performance (descending)
|
| 23 |
-
df = df.sort_values(by=["pass_at_1"], ascending=False)
|
| 24 |
-
|
| 25 |
-
# Round numeric columns to 2 decimal places
|
| 26 |
-
numeric_cols = [
|
| 27 |
-
"pass_at_1",
|
| 28 |
-
"pass_at_5",
|
| 29 |
-
"presence",
|
| 30 |
-
"identification",
|
| 31 |
-
"start_time",
|
| 32 |
-
"end_time",
|
| 33 |
-
"magnitude",
|
| 34 |
-
"categorization",
|
| 35 |
-
"correlation",
|
| 36 |
-
"indicator",
|
| 37 |
-
]
|
| 38 |
-
|
| 39 |
-
for col in numeric_cols:
|
| 40 |
-
if col in df.columns:
|
| 41 |
-
df[col] = df[col].round(2)
|
| 42 |
-
|
| 43 |
-
# Handle missing values - replace with 0 or appropriate value
|
| 44 |
-
df = df.fillna(0)
|
| 45 |
-
|
| 46 |
-
# Select only the columns we need
|
| 47 |
-
available_cols = [col for col in cols if col in df.columns]
|
| 48 |
-
df = df[available_cols]
|
| 49 |
|
| 50 |
return df
|
| 51 |
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
from src.display.formatting import make_clickable_model
|
| 4 |
|
| 5 |
|
| 6 |
+
def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 7 |
+
rename_map = {
|
| 8 |
+
"model type": "model_type",
|
| 9 |
+
"overall f1": "overall_f1",
|
| 10 |
+
"overall accuracy": "overall_accuracy",
|
| 11 |
+
"accuracy": "accuracy",
|
| 12 |
+
"tier i": "tier_i_f1",
|
| 13 |
+
"tier ii": "tier_ii_f1",
|
| 14 |
+
"tier iii": "tier_iii_f1",
|
| 15 |
+
"tier i accuracy": "tier_i_accuracy",
|
| 16 |
+
"tier ii accuracy": "tier_ii_accuracy",
|
| 17 |
+
"tier iii accuracy": "tier_iii_accuracy",
|
| 18 |
+
"start time": "start_time",
|
| 19 |
+
"end time": "end_time",
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
normalized = {}
|
| 23 |
+
for col in df.columns:
|
| 24 |
+
cleaned = col.strip().lower()
|
| 25 |
+
normalized[col] = rename_map.get(cleaned, cleaned.replace(" ", "_"))
|
| 26 |
+
|
| 27 |
+
return df.rename(columns=normalized)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_leaderboard_df(
    results_path: str,
    _requests_path: str,
    _cols: list,
    _benchmark_cols: list,
    sort_by: str = "overall_f1",
) -> pd.DataFrame:
    """Load a static leaderboard CSV and prepare it for display.

    Args:
        results_path: Path of the CSV file to read.
        _requests_path: Not read by this function.
        _cols: Not read by this function.
        _benchmark_cols: Not read by this function.
        sort_by: Normalized column name to sort by, highest value first.

    Returns:
        The frame with normalized column names. If a ``model`` column exists
        its values are passed through ``make_clickable_model``. If ``sort_by``
        names an existing column the rows are sorted by it in descending
        order; otherwise the file order is kept.
    """
    frame = _normalize_columns(pd.read_csv(results_path))

    if "model" in frame.columns:
        frame["model"] = frame["model"].apply(make_clickable_model)

    # An unknown sort key is not an error: the rows stay in file order.
    if sort_by not in frame.columns:
        return frame
    return frame.sort_values(by=[sort_by], ascending=False)
|
| 48 |
|