import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
CATEGORY_ACCURACY_COLS,
CATEGORY_F1_COLS,
OVERALL_TIER_COLS,
CategoryAccuracyColumn,
CategoryF1Column,
OverallTierColumn,
fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_leaderboard_df
def restart_space():
    """Restart the hosted Space so it reloads fresh evaluation results."""
    target_repo = REPO_ID
    API.restart_space(repo_id=target_repo)
def _load_board(csv_filename, columns, sort_key):
    """Load one leaderboard view from a results CSV under EVAL_RESULTS_PATH.

    The same column spec is passed for both column arguments, mirroring the
    original per-view calls; rows are sorted by `sort_key`.
    """
    return get_leaderboard_df(
        EVAL_RESULTS_PATH + "/" + csv_filename,
        EVAL_REQUESTS_PATH,
        columns,
        columns,
        sort_by=sort_key,
    )


# One DataFrame per leaderboard tab: overall/tier, per-category F1, per-category accuracy.
OVERALL_TIER_LEADERBOARD_DF = _load_board(
    "ARFBench_leaderboard.csv", OVERALL_TIER_COLS, "accuracy"
)
CATEGORY_F1_LEADERBOARD_DF = _load_board(
    "ARFBench_leaderboard_category_f1.csv", CATEGORY_F1_COLS, "overall_f1"
)
CATEGORY_ACCURACY_LEADERBOARD_DF = _load_board(
    "ARFBench_leaderboard_category_accuracy.csv", CATEGORY_ACCURACY_COLS, "overall_accuracy"
)
def init_custom_leaderboard(dataframe, column_class, filter_column_name, filter_label):
    """Build a read-only Leaderboard widget for one results view.

    Args:
        dataframe: populated results DataFrame; must be non-empty.
        column_class: column-spec class whose ``fields()`` entries carry
            ``name``, ``type``, ``displayed_by_default``, ``never_hidden``
            and ``hidden`` attributes.
        filter_column_name: column driving the 0-100 slider filter.
        filter_label: label shown on that slider.

    Raises:
        ValueError: if ``dataframe`` is None or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    column_specs = fields(column_class)
    shown_by_default = [spec.name for spec in column_specs if spec.displayed_by_default]
    always_shown = [spec.name for spec in column_specs if spec.never_hidden]
    hidden_columns = [spec.name for spec in column_specs if spec.hidden]

    score_slider = ColumnFilter(
        filter_column_name,
        type="slider",
        min=0,
        max=100,
        label=filter_label,
    )

    return Leaderboard(
        value=dataframe,
        datatype=[spec.type for spec in column_specs],
        select_columns=SelectColumns(
            default_selection=shown_by_default,
            cant_deselect=always_shown,
            label="Select Columns to Display:",
        ),
        search_columns=[column_class.model.name],
        hide_columns=hidden_columns,
        filter_columns=[score_slider],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
# Top-level Gradio app: title/intro, three leaderboard sub-tabs, and an About tab.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
            # Inner tab group: one leaderboard view per metric; id=0 selected on load.
            with gr.Tabs(selected=0):
                with gr.TabItem("Overall + Tier (Default)", id=0):
                    # Filter slider is driven by overall F1 even though this view
                    # is sorted by "accuracy" at load time (see DataFrame setup above).
                    leaderboard_overall_tier = init_custom_leaderboard(
                        OVERALL_TIER_LEADERBOARD_DF,
                        OverallTierColumn,
                        OverallTierColumn.overall_f1.name,
                        "Overall F1 score",
                    )
                with gr.TabItem("Per-Category F1", id=1):
                    leaderboard_category_f1 = init_custom_leaderboard(
                        CATEGORY_F1_LEADERBOARD_DF,
                        CategoryF1Column,
                        CategoryF1Column.overall_f1.name,
                        "Overall F1 score",
                    )
                with gr.TabItem("Per-Category Accuracy", id=2):
                    leaderboard_category_accuracy = init_custom_leaderboard(
                        CATEGORY_ACCURACY_LEADERBOARD_DF,
                        CategoryAccuracyColumn,
                        CategoryAccuracyColumn.overall_accuracy.name,
                        "Overall Accuracy score",
                    )
        with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
    # Collapsible citation box with copy button, below the tabs.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
# NOTE(review): `scheduler` is a leftover placeholder — leaderboard templates
# usually run a background scheduler that calls restart_space() periodically;
# none is configured here, so restart_space() above appears unused. Confirm
# whether periodic restarts are intended.
scheduler = None
# Allow up to 40 event handlers to run concurrently.
demo.queue(default_concurrency_limit=40)
if __name__ == "__main__":
    demo.launch()