Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pathlib | |
| import pandas as pd | |
| import plotly.express as px | |
| from datasets import load_dataset | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from collections import Counter | |
# -----------------------------
# Page Config
# -----------------------------
# Configure the browser tab title and use the full-width layout, then
# render the dashboard heading.
st.set_page_config(page_title="K-MHaS Korean Hate Speech Dashboard", layout="wide")

st.title("๐ฐ๐ท K-MHaS Korean Hate Speech Analytics Dashboard")
# -----------------------------
# Label Mapping
# -----------------------------
# Display name for each numeric label id; the id is the position in the
# tuple below, so the order here must not change.
LABEL_MAP = dict(
    enumerate(
        (
            "Origin",
            "Physical",
            "Politics",
            "Profanity",
            "Age",
            "Gender",
            "Race",
            "Religion",
            "Not Hate",
        )
    )
)
# -----------------------------
# Load Dataset (Parquet Revision)
# -----------------------------
# Streamlit re-executes this script on every widget interaction, so the
# loader is cached: without it each label selection would reload the whole
# dataset and rebuild the DataFrame. st.cache_data hands every rerun its own
# copy of the cached frame, which keeps the in-place column additions made
# further down from leaking into the cached value.
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the K-MHaS train split and return it as a pandas DataFrame.

    The dataset is read from the ``refs/convert/parquet`` revision of
    ``jeanlee/kmhas_korean_hate_speech``.

    Returns:
        A DataFrame built from the dataset rows. The rest of this script
        reads its ``text`` and ``label`` columns.
    """
    ds = load_dataset(
        "jeanlee/kmhas_korean_hate_speech",
        split="train",
        revision="refs/convert/parquet",
    )
    return pd.DataFrame(ds)


df = load_data()
# -----------------------------
# Preprocessing
# -----------------------------
def _label_names(label_ids):
    """Translate one row's numeric label ids into their LABEL_MAP names."""
    return [LABEL_MAP[label_id] for label_id in label_ids]


# Length of each text, as reported by len().
df["length"] = df["text"].map(len)
# Convert the numeric labels into their string names (one list per row).
df["label_name"] = df["label"].map(_label_names)
# One row per (sample, label name) pair so label frequencies can be counted.
df_exploded = df.explode("label_name")
# -----------------------------
# KPI Section
# -----------------------------
# Compute the headline numbers first, then render them as four metric cards.
sample_total = len(df)
mean_length = df["length"].mean()
# Frequency of every label name; also consumed by the charts section below.
label_counts = df_exploded["label_name"].value_counts()
most_common_label = label_counts.idxmax()
non_hate_total = label_counts.get("Not Hate", 0)
# Share of samples that are not tagged "Not Hate".
hate_share = 1 - (non_hate_total / sample_total)

kpi_cards = [
    ("์ด ์ํ ์", f"{sample_total:,}"),
    ("ํ์ค ๋น์จ", f"{hate_share:.2%}"),
    ("ํ๊ท ํ ์คํธ ๊ธธ์ด", f"{mean_length:.1f}"),
    ("์ต๋ค ๋ผ๋ฒจ", most_common_label),
]
for kpi_column, (caption, value) in zip(st.columns(4), kpi_cards):
    kpi_column.metric(caption, value)

st.markdown("---")
# -----------------------------
# Charts Section
# -----------------------------
# Build both figures up front, then place them side by side.
label_bar = px.bar(
    label_counts,
    x=label_counts.index,
    y=label_counts.values,
    title="๋ผ๋ฒจ ๋ถํฌ",
)
length_hist = px.histogram(
    df,
    x="length",
    nbins=50,
    title="ํ ์คํธ ๊ธธ์ด ๋ถํฌ",
)

bar_column, hist_column = st.columns(2)
bar_column.plotly_chart(label_bar, use_container_width=True)
hist_column.plotly_chart(length_hist, use_container_width=True)

st.markdown("---")
# -----------------------------
# Label Filter Section
# -----------------------------
st.subheader("๐ ๋ผ๋ฒจ ํํฐ")

# Offer every label name present in the data, in alphabetical order.
label_options = sorted(df_exploded["label_name"].unique())
selected_label = st.selectbox("๋ผ๋ฒจ ์ ํ", label_options)

# Keep the rows whose label list contains the chosen label.
has_selected_label = df["label_name"].map(
    lambda names: selected_label in names
)
filtered_df = df[has_selected_label]

st.write(f"์ ํ๋ ๋ผ๋ฒจ ์ํ ์: {len(filtered_df):,}")

# Preview at most the first 100 matching rows.
preview = filtered_df[["text", "label_name"]].head(100)
st.dataframe(preview, use_container_width=True)

st.markdown("---")
# -----------------------------
# WordCloud Section
# -----------------------------
# The font file is looked up next to this script (presumably so Hangul
# glyphs render; confirm that NanumGothic.ttf ships with the app).
BASE_DIR = pathlib.Path(__file__).resolve().parent
FONT_PATH = BASE_DIR / "NanumGothic.ttf"
# Upper bound on the number of texts fed to the word cloud.
MAX_WORDCLOUD_DOCS = 3000

st.subheader("โ๏ธ Word Cloud")

# Sample when there is too much text (keeps performance stable). Sampling
# happens before joining so a large selection is only concatenated once.
if len(filtered_df) > MAX_WORDCLOUD_DOCS:
    wordcloud_source = filtered_df.sample(MAX_WORDCLOUD_DOCS, random_state=42)
else:
    wordcloud_source = filtered_df
text_data = " ".join(wordcloud_source["text"].tolist())

wordcloud = None
# Whitespace-only text contains no words, so treat it like empty text.
if text_data.strip():
    try:
        wordcloud = WordCloud(
            font_path=str(FONT_PATH),
            background_color=None,
            mode="RGBA",
            colormap="viridis",
            width=1200,
            height=600,
            max_words=200,
            max_font_size=150,
            min_font_size=10,
            relative_scaling=0.5,
            collocations=False,
        ).generate(text_data)
    except ValueError:
        # generate() raises ValueError when tokenisation leaves no word to
        # draw; fall through to the "no text" notice instead of crashing.
        wordcloud = None

if wordcloud is not None:
    fig_wc, ax = plt.subplots(figsize=(14, 7))
    try:
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")
        # Transparent figure background to match the RGBA word cloud.
        fig_wc.patch.set_alpha(0)
        st.pyplot(fig_wc, use_container_width=True)
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # without this each script rerun would leak one more figure.
        plt.close(fig_wc)
else:
    st.info("ํด๋น ๋ผ๋ฒจ์ ๋ํ ํ ์คํธ๊ฐ ์์ต๋๋ค.")