ARFBench / src /display /utils.py
sxie78-dd's picture
update leaderboard results
3a013b1 unverified
from dataclasses import dataclass
from enum import Enum
def fields(raw_class):
    """Return the values of the non-dunder attributes defined on ``raw_class``.

    The class's own ``__dict__`` is walked in definition order, and a value is
    kept only when its attribute name neither starts nor ends with a double
    underscore. Attributes inherited from base classes are not included
    because only ``raw_class.__dict__`` is inspected.
    """
    public_values = []
    for attr_name, attr_value in raw_class.__dict__.items():
        # Skip anything that looks like a dunder on either side.
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        public_values.append(attr_value)
    return public_values
# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modification is needed
@dataclass
class ColumnContent:
    """Metadata describing a single user-facing table column.

    Attributes:
        name: Column name as used in the displayed table.
        type: Display type of the column; this file uses "markdown", "str",
            "number" and "bool".
        displayed_by_default: Whether the column is shown by default.
        hidden: When true, the column is left out of the ``*_COLS`` lists
            built at the bottom of this module.
        never_hidden: Flag set on the "model" columns; presumably tells the
            UI the column cannot be deselected (not used in this file).
    """

    # Required descriptors.
    name: str
    type: str
    displayed_by_default: bool

    # Optional visibility flags.
    hidden: bool = False
    never_hidden: bool = False
# ARFBench Leaderboard columns
@dataclass(frozen=True)
class AutoEvalColumn:
    """Column descriptors for the main ARFBench leaderboard table.

    Each class attribute is a ``ColumnContent``; the attributes carry no type
    annotations, so they are plain class attributes rather than dataclass
    fields. Definition order is significant because the module-level column
    lists are derived from it.
    """

    # Model column (always displayed)
    model = ColumnContent("model", "markdown", displayed_by_default=True, never_hidden=True)

    # Model type column
    model_type = ColumnContent("model_type", "str", displayed_by_default=True)

    # Performance metrics: overall and per-tier F1
    overall_f1 = ColumnContent("overall_f1", "number", displayed_by_default=True)
    tier_i_f1 = ColumnContent("tier_i_f1", "number", displayed_by_default=True)
    tier_ii_f1 = ColumnContent("tier_ii_f1", "number", displayed_by_default=True)
    tier_iii_f1 = ColumnContent("tier_iii_f1", "number", displayed_by_default=True)

    # Specific benchmark metrics
    presence = ColumnContent("presence", "number", displayed_by_default=True)
    identification = ColumnContent("identification", "number", displayed_by_default=True)
    start_time = ColumnContent("start_time", "number", displayed_by_default=True)
    end_time = ColumnContent("end_time", "number", displayed_by_default=True)
    magnitude = ColumnContent("magnitude", "number", displayed_by_default=True)
    categorization = ColumnContent("categorization", "number", displayed_by_default=True)
    correlation = ColumnContent("correlation", "number", displayed_by_default=True)
    indicator = ColumnContent("indicator", "number", displayed_by_default=True)
# Overall + per-tier leaderboard columns
@dataclass(frozen=True)
class OverallTierColumn:
    """Column descriptors for the overall + per-tier leaderboard table.

    Holds the model identity columns followed by accuracy and F1, each given
    overall and for tiers I-III. Attribute order defines column order.
    """

    # Model identity
    model = ColumnContent("model", "markdown", displayed_by_default=True, never_hidden=True)
    model_type = ColumnContent("model_type", "str", displayed_by_default=True)

    # Accuracy: overall, then per tier
    accuracy = ColumnContent("accuracy", "number", displayed_by_default=True)
    tier_i_accuracy = ColumnContent("tier_i_accuracy", "number", displayed_by_default=True)
    tier_ii_accuracy = ColumnContent("tier_ii_accuracy", "number", displayed_by_default=True)
    tier_iii_accuracy = ColumnContent("tier_iii_accuracy", "number", displayed_by_default=True)

    # F1: overall, then per tier
    overall_f1 = ColumnContent("overall_f1", "number", displayed_by_default=True)
    tier_i_f1 = ColumnContent("tier_i_f1", "number", displayed_by_default=True)
    tier_ii_f1 = ColumnContent("tier_ii_f1", "number", displayed_by_default=True)
    tier_iii_f1 = ColumnContent("tier_iii_f1", "number", displayed_by_default=True)
# Per-category F1 leaderboard columns
@dataclass(frozen=True)
class CategoryF1Column:
    """Column descriptors for the per-category F1 leaderboard table.

    Holds the model identity columns, the overall F1 column, and one numeric
    column per benchmark category. Attribute order defines column order.
    """

    # Model identity
    model = ColumnContent("model", "markdown", displayed_by_default=True, never_hidden=True)
    model_type = ColumnContent("model_type", "str", displayed_by_default=True)

    # Aggregate score
    overall_f1 = ColumnContent("overall_f1", "number", displayed_by_default=True)

    # One column per benchmark category
    presence = ColumnContent("presence", "number", displayed_by_default=True)
    identification = ColumnContent("identification", "number", displayed_by_default=True)
    start_time = ColumnContent("start_time", "number", displayed_by_default=True)
    end_time = ColumnContent("end_time", "number", displayed_by_default=True)
    magnitude = ColumnContent("magnitude", "number", displayed_by_default=True)
    categorization = ColumnContent("categorization", "number", displayed_by_default=True)
    correlation = ColumnContent("correlation", "number", displayed_by_default=True)
    indicator = ColumnContent("indicator", "number", displayed_by_default=True)
# Per-category accuracy leaderboard columns
@dataclass(frozen=True)
class CategoryAccuracyColumn:
    """Column descriptors for the per-category accuracy leaderboard table.

    Holds the model identity columns, the overall accuracy column, and one
    numeric column per benchmark category. Attribute order defines column
    order.
    """

    # Model identity
    model = ColumnContent("model", "markdown", displayed_by_default=True, never_hidden=True)
    model_type = ColumnContent("model_type", "str", displayed_by_default=True)

    # Aggregate score
    overall_accuracy = ColumnContent("overall_accuracy", "number", displayed_by_default=True)

    # One column per benchmark category
    presence = ColumnContent("presence", "number", displayed_by_default=True)
    identification = ColumnContent("identification", "number", displayed_by_default=True)
    start_time = ColumnContent("start_time", "number", displayed_by_default=True)
    end_time = ColumnContent("end_time", "number", displayed_by_default=True)
    magnitude = ColumnContent("magnitude", "number", displayed_by_default=True)
    categorization = ColumnContent("categorization", "number", displayed_by_default=True)
    correlation = ColumnContent("correlation", "number", displayed_by_default=True)
    indicator = ColumnContent("indicator", "number", displayed_by_default=True)
# For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Column descriptors for the evaluation-queue table in the submission tab.

    Each class attribute is a ``ColumnContent``; attribute order defines the
    order of the derived ``EVAL_COLS`` / ``EVAL_TYPES`` lists.
    """

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # The third positional argument is ``displayed_by_default`` (a bool). The
    # previous value was the string "Original", which only worked because it
    # is truthy; ``True`` keeps that behaviour with the declared type.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
# All the model information that we might need
@dataclass
class ModelDetails:
    """Descriptive record used as the value of the enums in this module.

    Attributes:
        name: Identifier or label of the entry (required).
        display_name: Optional alternative label; defaults to an empty string.
        symbol: Optional emoji shown next to the name; defaults to an empty
            string.
    """

    name: str
    display_name: str = ""
    symbol: str = ""  # emoji
class ModelType(Enum):
    """Model categories, each carrying a ``ModelDetails`` value."""

    LLM = ModelDetails(name="LLM", symbol="🟢")
    VLM = ModelDetails(name="VLM", symbol="🔶")
    TSFM = ModelDetails(name="Post-trained TSFM", symbol="⭕")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return the member's symbol and name joined by ``separator``."""
        details = self.value
        return f"{details.symbol}{separator}{details.name}"

    @staticmethod
    def from_str(type):
        """Map a free-form label to a ``ModelType`` member.

        The label matches a member when it contains either that member's
        keyword or its emoji. Candidates are tried in the order VLM, LLM,
        TSFM; ``ModelType.Unknown`` is returned when nothing matches.
        """
        # Order matters: the first candidate with a matching marker wins.
        candidates = (
            (ModelType.VLM, ("VLM", "🔶")),
            (ModelType.LLM, ("LLM", "🟢")),
            (ModelType.TSFM, ("TSFM", "⭕")),
        )
        for member, markers in candidates:
            if any(marker in type for marker in markers):
                return member
        return ModelType.Unknown
class WeightType(Enum):
    """Weight formats, each carrying a ``ModelDetails`` whose name matches the member."""

    Adapter = ModelDetails(name="Adapter")
    Original = ModelDetails(name="Original")
    Delta = ModelDetails(name="Delta")
class Precision(Enum):
    """Numeric precisions, each carrying a ``ModelDetails`` value."""

    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    # Declared as a staticmethod, matching ``ModelType.from_str``. Without the
    # decorator the function only works when called on the class; called on a
    # member, the member itself would be bound as ``precision``.
    @staticmethod
    def from_str(precision):
        """Map a precision string to a ``Precision`` member.

        Accepts the bare dtype name or its ``torch.``-prefixed form for
        float16 and bfloat16; any other value yields ``Precision.Unknown``.
        """
        if precision in ("torch.float16", "float16"):
            return Precision.float16
        if precision in ("torch.bfloat16", "bfloat16"):
            return Precision.bfloat16
        return Precision.Unknown
# Column selection
# Names of the non-hidden main leaderboard columns, in definition order.
COLS = [column.name for column in fields(AutoEvalColumn) if not column.hidden]
# Names and display types of the evaluation-queue columns (parallel lists).
EVAL_COLS = [column.name for column in fields(EvalQueueColumn)]
EVAL_TYPES = [column.type for column in fields(EvalQueueColumn)]
# Define the benchmark columns for ARFBench
BENCHMARK_COLS = (
    # Model identity
    ["model", "model_type"]
    # Overall and per-tier F1
    + ["overall_f1", "tier_i_f1", "tier_ii_f1", "tier_iii_f1"]
    # Per-category metrics
    + ["presence", "identification", "start_time", "end_time"]
    + ["magnitude", "categorization", "correlation", "indicator"]
)
# New leaderboard datasets
# Each list holds the names of the non-hidden columns of one leaderboard
# table, in the order the columns are defined on the corresponding class.
OVERALL_TIER_COLS = [
    column.name for column in fields(OverallTierColumn) if not column.hidden
]
CATEGORY_F1_COLS = [
    column.name for column in fields(CategoryF1Column) if not column.hidden
]
CATEGORY_ACCURACY_COLS = [
    column.name for column in fields(CategoryAccuracyColumn) if not column.hidden
]