| | import pandas as pd |
| | import ast |
| | import json |
| | import plotly.express as px |
| | import plotly.graph_objects as go |
| |
|
| |
|
class TaskVisualizations:
    """Build sunburst charts of task counts grouped by area.

    Two merged tables are kept on the instance: one built from the full task
    counts file and one built from the "selected" task counts file. The
    charting code reads the ``area``, ``task`` and ``count`` columns of these
    tables, so the input CSV files are expected to provide them between them
    (NOTE(review): confirm against the actual data files).
    """

    def __init__(
        self, task_counts_path, selected_task_counts_path, tasks_with_areas_path
    ):
        """Load both merged tables.

        Args:
            task_counts_path: CSV with counts for all tasks.
            selected_task_counts_path: CSV with counts for the selected tasks.
            tasks_with_areas_path: CSV that is merged with each counts file
                on the ``task`` column.
        """
        self.tasks_with_areas_df = self.load_tasks_with_areas_df(
            task_counts_path, tasks_with_areas_path
        )
        self.selected_tasks_with_areas_df = self.load_tasks_with_areas_df(
            selected_task_counts_path, tasks_with_areas_path
        )

    @classmethod
    def load_tasks_with_areas_df(
        cls, task_counts_path, tasks_with_areas_path="data/paperswithcode_tasks.csv"
    ):
        """Read both CSV files and join them on the ``task`` column.

        The join is pandas' default inner merge, so only tasks present in
        both files are kept.

        Returns:
            The merged DataFrame.
        """
        task_counts_df = pd.read_csv(task_counts_path)
        raw_tasks_with_areas_df = pd.read_csv(tasks_with_areas_path)
        return raw_tasks_with_areas_df.merge(task_counts_df, on="task")

    @classmethod
    def get_topk_merge_others(cls, df, by_col, val_col, k=10, val_threshold=1000):
        """Keep the top-``k`` labels and fold everything else into "other".

        Rows are ranked by ``val_col`` in descending order. A label from
        ``by_col`` is kept only if it appears among the first ``k`` ranked
        rows and its value there is at least ``val_threshold``; every other
        row is relabelled ``"other"``. Values are then summed per label.

        Args:
            df: Input table; it is not modified.
            by_col: Name of the label column.
            val_col: Name of the numeric column used for ranking and summing.
            k: Number of top-ranked rows considered for keeping.
            val_threshold: Minimum value a top-ranked row needs to keep its
                label.

        Returns:
            A DataFrame indexed by ``by_col`` with the single column
            ``val_col`` holding the per-label sums.
        """
        # sort_values already returns a new frame, so relabelling below does
        # not touch the caller's data.
        ranked = df.sort_values(val_col, ascending=False)
        top_rows = ranked.head(k)
        # If a label occurs more than once among the top rows, the value of
        # its last occurrence is the one compared against the threshold.
        top_values = dict(zip(top_rows[by_col], top_rows[val_col]))
        kept_labels = {
            label for label, value in top_values.items() if value >= val_threshold
        }
        ranked[by_col] = ranked[by_col].map(
            lambda label: label if label in kept_labels else "other"
        )
        # The string "sum" selects pandas' own reduction; passing the builtin
        # callable is deprecated in recent pandas releases.
        return ranked.groupby(by_col).agg({val_col: "sum"})

    @classmethod
    def get_displayed_tasks_with_areas_df(cls, tasks_with_areas_df, min_task_count):
        """Prepare the per-area task table shown in the sunburst chart.

        Rows containing any missing value are dropped. Within each area,
        tasks whose count is below ``min_task_count`` (or that fall outside
        the area's top tasks, see ``get_topk_merge_others``) are merged into
        a single "other" entry. Each task label then gets its count appended,
        e.g. ``"other 7"``.

        Args:
            tasks_with_areas_df: Table with ``area``, ``task`` and ``count``
                columns; it is not modified.
            min_task_count: Minimum count for a task to be shown by name.

        Returns:
            A DataFrame with columns ``area``, ``task`` and ``count``, ordered
            by area and then by task label. Empty input yields an empty frame
            with those columns.
        """
        complete_rows = tasks_with_areas_df.dropna()
        if complete_rows.empty:
            return pd.DataFrame(columns=["area", "task", "count"])

        # Vectorised relabel: keep the task name only where the count reaches
        # the threshold.
        complete_rows = complete_rows.assign(
            task=complete_rows["task"].where(
                complete_rows["count"] >= min_task_count, "other"
            )
        )

        # groupby yields the areas in sorted order, and each merged piece is
        # sorted by task label, which fixes the row order of the result.
        pieces = []
        for area, area_df in complete_rows.groupby("area"):
            merged = cls.get_topk_merge_others(
                area_df, "task", "count", val_threshold=min_task_count
            ).reset_index()
            merged.insert(0, "area", area)
            pieces.append(merged)

        displayed_tasks_with_areas_df = pd.concat(pieces, ignore_index=True)
        displayed_tasks_with_areas_df["task"] = (
            displayed_tasks_with_areas_df["task"]
            + " "
            + displayed_tasks_with_areas_df["count"].astype(str)
        )
        return displayed_tasks_with_areas_df

    def get_tasks_sunburst(self, min_task_count, which_df="selected"):
        """Return a plotly sunburst figure of task counts nested under areas.

        Args:
            min_task_count: Minimum count for a task to be shown by name;
                smaller tasks are merged into "other" within their area.
            which_df: ``"selected"`` uses the selected-tasks table; any other
                value uses the full table.
        """
        if which_df == "selected":
            df = self.selected_tasks_with_areas_df
        else:
            df = self.tasks_with_areas_df

        displayed_tasks_with_areas_df = self.get_displayed_tasks_with_areas_df(
            df, min_task_count
        )

        return px.sunburst(
            displayed_tasks_with_areas_df, path=["area", "task"], values="count"
        )
| |
|