| | import gradio as gr |
| | import pandas as pd |
| | import logging |
| | import re |
| | from task_visualizations import TaskVisualizations |
| | import plotly.graph_objects as go |
| | from functools import partial |
| | from text_visualization import WordCloudExtractor |
| |
|
| | logging.basicConfig(level=logging.INFO) |
| |
|
| |
|
class AppConfig:
    """Locations of the data files read by the app.

    All paths are written relative to a ``data/`` directory.
    """

    # JSON Lines file passed to ``load_repo_df``.
    repo_representations_path = "data/repo_representations.jsonl"

    # CSV inputs handed to ``TaskVisualizations``.
    task_counts_path = "data/repos_task_counts.csv"
    selected_task_counts_path = "data/selected_repos_task_counts.csv"
    tasks_path = "data/paperswithcode_tasks.csv"
| |
|
| |
|
def load_repo_df(repo_representations_path):
    """Load repository representations and clean up their text.

    Args:
        repo_representations_path: Path to a JSON Lines file of records.
            Each record must have a ``text`` field.

    Returns:
        A new DataFrame with the same columns, where ``text`` has had
        self-closing ``<img .../>`` tags removed and the ``│`` and ``⋮``
        characters replaced by newlines.
    """
    data = pd.read_json(repo_representations_path, lines=True, orient="records")
    cleaned_text = (
        data["text"]
        # ``[^>]*`` keeps each match inside a single tag. A greedy ``.*``
        # would also delete any text sitting between two image tags that
        # share a line.
        .str.replace(r"<img[^>]*/>", "", regex=True)
        # Literal single-character swaps; ``regex=False`` makes that explicit
        # regardless of the pandas default.
        .str.replace("│", "\n", regex=False)
        .str.replace("⋮", "\n", regex=False)
    )
    return data.assign(text=cleaned_text)
| |
|
| |
|
def display_representations(repo, representation1, representation2, df=None):
    """Return the texts of two representations of one repository.

    Args:
        repo: Value matched against the ``repo_name`` column.
        representation1: Value matched against the ``representation`` column
            for the first result.
        representation2: Same, for the second result.
        df: Optional DataFrame with ``repo_name``, ``representation`` and
            ``text`` columns. Defaults to the module-level ``repos_df``.

    Returns:
        A ``(text1, text2)`` tuple. Each element is the ``text`` of the first
        matching row, or ``"No data available"`` when no row matches.
    """
    if df is None:
        df = repos_df
    repo_data = df[df["repo_name"] == repo]
    logging.info("repo_data: %s", repo_data)

    def _first_text(representation):
        # Filter once and reuse the result for both the emptiness check and
        # the lookup.
        matches = repo_data.loc[repo_data["representation"] == representation, "text"]
        if matches.empty:
            return "No data available"
        return matches.iloc[0]

    return _first_text(representation1), _first_text(representation2)
| |
|
| |
|
def get_representation_wordclouds(representations, repos_df):
    """Build one wordcloud image per representation type.

    Args:
        representations: Iterable of values to look up in the
            ``representation`` column.
        repos_df: DataFrame with ``representation`` and ``text`` columns.

    Returns:
        A dict mapping each representation to the result of
        ``WordCloudExtractor().extract_wordcloud_image`` applied to the list
        of that representation's texts.
    """

    def _images_by_representation():
        for kind in representations:
            kind_rows = repos_df[repos_df["representation"] == kind]
            kind_texts = list(kind_rows["text"])
            # A fresh extractor is created for every representation.
            yield kind, WordCloudExtractor().extract_wordcloud_image(kind_texts)

    return dict(_images_by_representation())
| |
|
| |
|
def setup_repository_representations_tab(repos, representation_types):
    """Lay out the repository-representation comparison UI.

    Creates, in order: a wordcloud gallery (one image per representation
    type, computed from the module-level ``repos_df``), a row of three
    dropdowns (repository plus two representation types) and a row of two
    Markdown panels showing the selected representations.

    Args:
        repos: Repository names offered in the dropdown. The first entry is
            the initial selection, so the sequence must be non-empty.
        representation_types: Representation names offered in both
            representation dropdowns and passed to the wordcloud builder.
    """
    images_by_type = get_representation_wordclouds(representation_types, repos_df)
    gr.Markdown("## Wordclouds")
    gallery_items = [(image, caption) for caption, image in images_by_type.items()]
    gr.Gallery(gallery_items, columns=[3], rows=[4], height=300)

    gr.Markdown("Select a repository and two representation types to compare them.")
    with gr.Row():
        repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
        representation1 = gr.Dropdown(
            choices=representation_types,
            label="Representation 1",
            value="readme",
        )
        representation2 = gr.Dropdown(
            choices=representation_types,
            label="Representation 2",
            value="generated_readme",
        )

    # Two identical panel columns, one Markdown component in each.
    panels = []
    with gr.Row():
        for column_id in ("column1", "column2"):
            with gr.Column(
                elem_id=column_id,
                variant="panel",
                scale=1,
                min_width=300,
            ):
                panels.append(gr.Markdown())
    text1, text2 = panels

    def update_representations(repo, representation1, representation2):
        # Prefix each text with a heading naming the representation it shows.
        first_text, second_text = display_representations(
            repo, representation1, representation2
        )
        first_panel = f"### Representation 1: {representation1}\n\n{first_text}"
        second_panel = f"### Representation 2: {representation2}\n\n{second_text}"
        return (first_panel, second_panel)

    # Pre-fill both panels for the default dropdown selection.
    initial_first, initial_second = update_representations(
        repos[0], "readme", "generated_readme"
    )
    text1.value = initial_first
    text2.value = initial_second

    # Changing any of the three dropdowns refreshes both panels.
    selectors = [repo, representation1, representation2]
    for selector in selectors:
        selector.change(
            fn=update_representations,
            inputs=list(selectors),
            outputs=[text1, text2],
        )
| |
|
| |
|
| | |
# Module-level data shared by the UI code below.
repos_df = load_repo_df(AppConfig.repo_representations_path)

# Distinct repository names and representation types, in order of first
# appearance in the data.
repos, representation_types = (
    list(repos_df[column_name].unique())
    for column_name in ("repo_name", "representation")
)
logging.info(f"found {len(repos)} repositories")
logging.info(f"representation types: {representation_types}")

task_visualizations = TaskVisualizations(
    AppConfig.task_counts_path,
    AppConfig.selected_task_counts_path,
    AppConfig.tasks_path,
)
| |
|
# Top-level UI: one tab for comparing repository representations and one for
# the PapersWithCode task sunburst plots.
with gr.Blocks() as demo:
    with gr.Tab("Explore Repository Representations"):
        setup_repository_representations_tab(repos, representation_types)
    with gr.Tab("Explore PapersWithCode Tasks"):
        # Built from explicit lines so the Markdown content does not depend
        # on how this source file is indented.
        task_counts_description = "\n".join(
            [
                "## PapersWithCode Tasks Visualization",
                "",
                "PapersWithCode tasks are grouped by area.",
                "",
                "In addition to showing task distribution across the original dataset we display task counts in the repositories we selected.",
            ]
        )

        gr.Markdown(task_counts_description)

        with gr.Row():
            min_task_counts_slider_all = gr.Slider(
                minimum=50,
                maximum=1000,
                value=150,
                step=50,
                label="Minimum Task Count (All Repositories)",
            )
            update_button = gr.Button("Update Plots")
            min_task_counts_slider_selected = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=10,
                label="Minimum Task Count (Selected Repositories)",
            )
            update_selected_button = gr.Button("Update Plots")

        # NOTE(review): this used to be gr.Row("Task Counts"). Row has no
        # title parameter and its options are keyword-only in the Gradio
        # releases I know of, so the stray positional string was dropped.
        # Confirm against the pinned Gradio version.
        with gr.Row():
            all_repos_tasks_plot = gr.Plot(label="All Repositories")
            selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")

        # Each button redraws only its own plot, using its own slider value
        # as the first argument to get_tasks_sunburst.
        update_button.click(
            fn=partial(task_visualizations.get_tasks_sunburst, which_df="all"),
            inputs=[min_task_counts_slider_all],
            outputs=[all_repos_tasks_plot],
        )

        update_selected_button.click(
            fn=partial(task_visualizations.get_tasks_sunburst, which_df="selected"),
            inputs=[min_task_counts_slider_selected],
            outputs=[selected_repos_tasks_plot],
        )

demo.launch()
| |
|