File size: 4,345 Bytes
c726497
 
 
 
 
 
 
 
 
 
 
 
3a013b1
 
 
 
 
 
c726497
 
 
8110fce
c726497
 
 
 
 
 
3a013b1
 
 
 
 
dcb2e05
8110fce
c726497
3a013b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c726497
3a013b1
 
c726497
 
 
 
3a013b1
c726497
3a013b1
 
c726497
 
3a013b1
 
c726497
 
3a013b1
c726497
8110fce
 
3a013b1
c726497
 
 
 
 
 
 
 
 
 
 
 
 
8110fce
3a013b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c726497
8110fce
c726497
 
 
 
 
 
 
 
 
 
 
 
8110fce
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    CATEGORY_ACCURACY_COLS,
    CATEGORY_F1_COLS,
    OVERALL_TIER_COLS,
    CategoryAccuracyColumn,
    CategoryF1Column,
    OverallTierColumn,
    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_leaderboard_df


def restart_space() -> None:
    """Restart the hosting Space via the HF Hub API (used to refresh data)."""
    API.restart_space(repo_id=REPO_ID)


# Load the three leaderboard views at import time, one per results CSV.
# Each call passes the same column list twice — presumably (all columns,
# benchmark columns); verify against get_leaderboard_df's signature.

# Default view: overall score plus tier breakdown, sorted by accuracy.
OVERALL_TIER_LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH + "/ARFBench_leaderboard.csv",
    EVAL_REQUESTS_PATH,
    OVERALL_TIER_COLS,
    OVERALL_TIER_COLS,
    sort_by="accuracy",
)

# Per-category F1 view, sorted by the aggregate F1 column.
CATEGORY_F1_LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH + "/ARFBench_leaderboard_category_f1.csv",
    EVAL_REQUESTS_PATH,
    CATEGORY_F1_COLS,
    CATEGORY_F1_COLS,
    sort_by="overall_f1",
)

# Per-category accuracy view, sorted by the aggregate accuracy column.
CATEGORY_ACCURACY_LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH + "/ARFBench_leaderboard_category_accuracy.csv",
    EVAL_REQUESTS_PATH,
    CATEGORY_ACCURACY_COLS,
    CATEGORY_ACCURACY_COLS,
    sort_by="overall_accuracy",
)


def init_custom_leaderboard(dataframe, column_class, filter_column_name, filter_label):
    """Build a read-only Leaderboard component from a dataframe and column schema.

    Args:
        dataframe: Populated leaderboard data; must be non-empty.
        column_class: Column-definition class understood by ``fields()``.
        filter_column_name: Column driving the 0–100 slider filter.
        filter_label: Label shown next to the slider.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Resolve the column schema once and derive the display groups from it.
    schema = fields(column_class)
    default_columns = [col.name for col in schema if col.displayed_by_default]
    pinned_columns = [col.name for col in schema if col.never_hidden]
    hidden_columns = [col.name for col in schema if col.hidden]

    # Single slider filter over the requested score column.
    score_filter = ColumnFilter(
        filter_column_name,
        type="slider",
        min=0,
        max=100,
        label=filter_label,
    )

    return Leaderboard(
        value=dataframe,
        datatype=[col.type for col in schema],
        select_columns=SelectColumns(
            default_selection=default_columns,
            cant_deselect=pinned_columns,
            label="Select Columns to Display:",
        ),
        search_columns=[column_class.model.name],
        hide_columns=hidden_columns,
        filter_columns=[score_filter],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


# Assemble the Gradio app: title/intro, a tabbed leaderboard section,
# an About tab, and a citation accordion.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
            # Nested tabs: one sub-view per leaderboard dataframe loaded above.
            with gr.Tabs(selected=0):
                with gr.TabItem("Overall + Tier (Default)", id=0):
                    leaderboard_overall_tier = init_custom_leaderboard(
                        OVERALL_TIER_LEADERBOARD_DF,
                        OverallTierColumn,
                        OverallTierColumn.overall_f1.name,
                        "Overall F1 score",
                    )

                with gr.TabItem("Per-Category F1", id=1):
                    leaderboard_category_f1 = init_custom_leaderboard(
                        CATEGORY_F1_LEADERBOARD_DF,
                        CategoryF1Column,
                        CategoryF1Column.overall_f1.name,
                        "Overall F1 score",
                    )

                with gr.TabItem("Per-Category Accuracy", id=2):
                    leaderboard_category_accuracy = init_custom_leaderboard(
                        CATEGORY_ACCURACY_LEADERBOARD_DF,
                        CategoryAccuracyColumn,
                        CategoryAccuracyColumn.overall_accuracy.name,
                        "Overall Accuracy score",
                    )

        with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    with gr.Row():
        # Copyable BibTeX-style citation text.
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# NOTE(review): `scheduler` is never used in this file — presumably a leftover
# from a template that ran restart_space() on a schedule; confirm before removing.
scheduler = None
# Allow up to 40 concurrent event handlers in the Gradio queue.
demo.queue(default_concurrency_limit=40)
if __name__ == "__main__":
    demo.launch()