# ARFBench / app.py
# sxie78-dd's picture
# change view to acc
# dcb2e05 unverified
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
CATEGORY_ACCURACY_COLS,
CATEGORY_F1_COLS,
OVERALL_TIER_COLS,
CategoryAccuracyColumn,
CategoryF1Column,
OverallTierColumn,
fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_leaderboard_df
def restart_space():
    """Ask the ``API`` client from ``src.envs`` to restart this Space.

    The target is the module-level ``REPO_ID``. Nothing is returned.
    """
    space_id = REPO_ID
    API.restart_space(repo_id=space_id)
def _load_leaderboard(csv_suffix, columns, sort_column):
    """Load one leaderboard table via ``get_leaderboard_df``.

    ``csv_suffix`` is appended to ``EVAL_RESULTS_PATH`` to form the results
    path, and ``columns`` is passed for both the third and fourth positional
    arguments, exactly as each call site needs it.
    """
    return get_leaderboard_df(
        EVAL_RESULTS_PATH + csv_suffix,
        EVAL_REQUESTS_PATH,
        columns,
        columns,
        sort_by=sort_column,
    )


# The three tables are loaded once at import time, in this order.
OVERALL_TIER_LEADERBOARD_DF = _load_leaderboard(
    "/ARFBench_leaderboard.csv",
    OVERALL_TIER_COLS,
    "accuracy",
)

CATEGORY_F1_LEADERBOARD_DF = _load_leaderboard(
    "/ARFBench_leaderboard_category_f1.csv",
    CATEGORY_F1_COLS,
    "overall_f1",
)

CATEGORY_ACCURACY_LEADERBOARD_DF = _load_leaderboard(
    "/ARFBench_leaderboard_category_accuracy.csv",
    CATEGORY_ACCURACY_COLS,
    "overall_accuracy",
)
def init_custom_leaderboard(
    dataframe,
    column_class,
    filter_column_name,
    filter_label,
    *,
    filter_min=0,
    filter_max=100,
):
    """Build a read-only ``Leaderboard`` component for one results table.

    Args:
        dataframe: Table to display. Must expose an ``empty`` attribute
            (presumably a pandas DataFrame -- confirm against
            ``get_leaderboard_df``).
        column_class: Column-schema class. ``fields(column_class)`` must yield
            entries exposing ``name``, ``type``, ``displayed_by_default``,
            ``never_hidden`` and ``hidden``; the class itself must have a
            ``model`` attribute whose ``name`` is used as the search column.
        filter_column_name: Name of the column the slider filter acts on.
        filter_label: Label shown next to the slider filter.
        filter_min: Lower bound of the slider (keyword-only, default ``0``).
        filter_max: Upper bound of the slider (keyword-only, default ``100``).

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or reports itself as empty.
    """
    # Fail fast before touching any UI machinery.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Resolve the schema once; every derived list below reads from it.
    column_fields = fields(column_class)

    column_selector = SelectColumns(
        default_selection=[c.name for c in column_fields if c.displayed_by_default],
        cant_deselect=[c.name for c in column_fields if c.never_hidden],
        label="Select Columns to Display:",
    )
    score_filter = ColumnFilter(
        filter_column_name,
        type="slider",
        min=filter_min,
        max=filter_max,
        label=filter_label,
    )

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in column_fields],
        select_columns=column_selector,
        search_columns=[column_class.model.name],
        hide_columns=[c.name for c in column_fields if c.hidden],
        filter_columns=[score_filter],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
# Top-level UI: a title/intro, a tab strip (leaderboard views + about page),
# and a collapsible citation box.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
            # Nested tab strip: one tab per leaderboard table, first one selected.
            with gr.Tabs(selected=0):
                with gr.TabItem("Overall + Tier (Default)", id=0):
                    # NOTE(review): this table is sorted by "accuracy" but the
                    # slider filters on overall F1 -- confirm that is intended.
                    leaderboard_overall_tier = init_custom_leaderboard(
                        OVERALL_TIER_LEADERBOARD_DF,
                        OverallTierColumn,
                        OverallTierColumn.overall_f1.name,
                        "Overall F1 score",
                    )

                with gr.TabItem("Per-Category F1", id=1):
                    leaderboard_category_f1 = init_custom_leaderboard(
                        CATEGORY_F1_LEADERBOARD_DF,
                        CategoryF1Column,
                        CategoryF1Column.overall_f1.name,
                        "Overall F1 score",
                    )

                with gr.TabItem("Per-Category Accuracy", id=2):
                    leaderboard_category_accuracy = init_custom_leaderboard(
                        CATEGORY_ACCURACY_LEADERBOARD_DF,
                        CategoryAccuracyColumn,
                        CategoryAccuracyColumn.overall_accuracy.name,
                        "Overall Accuracy score",
                    )

        with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # NOTE(review): placed directly under the Blocks root, below the tab
    # strip -- confirm this nesting matches the intended layout.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# No background scheduler is configured; the name is kept as a placeholder.
scheduler = None

# Enable request queuing with a default concurrency limit of 40 per event.
demo.queue(default_concurrency_limit=40)

if __name__ == "__main__":
    demo.launch()