"""Streamlit dashboard exploring the K-MHaS Korean hate speech dataset.

The script loads the training split, derives per-sample text length and
readable label names, and renders KPI metrics, distribution charts, a
label filter with a sample table, and a word cloud for the chosen label.
"""

import pathlib
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import streamlit as st
from datasets import load_dataset
from wordcloud import WordCloud

# -----------------------------
# Page Config
# -----------------------------
st.set_page_config(
    page_title="K-MHaS Korean Hate Speech Dashboard",
    layout="wide",
)

st.title("πŸ‡°πŸ‡· K-MHaS Korean Hate Speech Analytics Dashboard")

# -----------------------------
# Label Mapping
# -----------------------------
# Maps the integer ids found in the dataset's "label" column to display names.
LABEL_MAP = {
    0: "Origin",
    1: "Physical",
    2: "Politics",
    3: "Profanity",
    4: "Age",
    5: "Gender",
    6: "Race",
    7: "Religion",
    8: "Not Hate",
}

# Upper bound on the number of texts fed to the word cloud; larger selections
# are down-sampled to keep rendering time stable.
MAX_WORDCLOUD_DOCS = 3000


# -----------------------------
# Load Dataset (Parquet Revision)
# -----------------------------
@st.cache_data
def load_data():
    """Load the K-MHaS training split and return it as a DataFrame.

    The result is cached by Streamlit so the dataset is not reloaded on
    every script rerun.

    Returns:
        pandas.DataFrame built from the "train" split of the
        "refs/convert/parquet" revision of the dataset.
    """
    ds = load_dataset(
        "jeanlee/kmhas_korean_hate_speech",
        split="train",
        revision="refs/convert/parquet",
    )
    return pd.DataFrame(ds)


df = load_data()

# -----------------------------
# Preprocessing
# -----------------------------
df["length"] = df["text"].apply(len)

# Convert the numeric label ids of each sample into their display names.
df["label_name"] = df["label"].apply(
    lambda labels: [LABEL_MAP[l] for l in labels]
)

# One row per (sample, label) pair so labels can be counted individually.
df_exploded = df.explode("label_name")

# -----------------------------
# KPI Section
# -----------------------------
col1, col2, col3, col4 = st.columns(4)

total_samples = len(df)
avg_length = df["length"].mean()
label_counts = df_exploded["label_name"].value_counts()
top_label = label_counts.idxmax()

not_hate_count = label_counts.get("Not Hate", 0)
# NOTE(review): treats every "Not Hate" label occurrence as one sample;
# confirm that "Not Hate" never co-occurs with itself or other labels.
hate_ratio = 1 - (not_hate_count / total_samples)

col1.metric("총 μƒ˜ν”Œ 수", f"{total_samples:,}")
col2.metric("혐였 λΉ„μœ¨", f"{hate_ratio:.2%}")
col3.metric("평균 ν…μŠ€νŠΈ 길이", f"{avg_length:.1f}")
col4.metric("μ΅œλ‹€ 라벨", top_label)

st.markdown("---")

# -----------------------------
# Charts Section
# -----------------------------
left, right = st.columns(2)

with left:
    fig1 = px.bar(
        label_counts,
        x=label_counts.index,
        y=label_counts.values,
        title="라벨 뢄포",
    )
    st.plotly_chart(fig1, use_container_width=True)

with right:
    fig2 = px.histogram(
        df,
        x="length",
        nbins=50,
        title="ν…μŠ€νŠΈ 길이 뢄포",
    )
    st.plotly_chart(fig2, use_container_width=True)

st.markdown("---")

# -----------------------------
# Label Filter Section
# -----------------------------
st.subheader("πŸ”Ž 라벨 ν•„ν„°")

selected_label = st.selectbox(
    "라벨 선택",
    sorted(df_exploded["label_name"].unique()),
)

# Keep every sample whose label list contains the selected label.
filtered_df = df[df["label_name"].apply(lambda x: selected_label in x)]

st.write(f"μ„ νƒλœ 라벨 μƒ˜ν”Œ 수: {len(filtered_df):,}")

st.dataframe(
    filtered_df[["text", "label_name"]].head(100),
    use_container_width=True,
)

st.markdown("---")

# -----------------------------
# WordCloud Section
# -----------------------------
# The font file is expected to sit next to this script.
BASE_DIR = pathlib.Path(__file__).resolve().parent
FONT_PATH = BASE_DIR / "NanumGothic.ttf"

st.subheader("☁️ Word Cloud")

# Down-sample first (fixed seed for reproducibility) so the potentially huge
# full text is never joined only to be thrown away.
if len(filtered_df) > MAX_WORDCLOUD_DOCS:
    cloud_df = filtered_df.sample(MAX_WORDCLOUD_DOCS, random_state=42)
else:
    cloud_df = filtered_df

text_data = " ".join(cloud_df["text"].tolist())

# strip() also rejects whitespace-only text, which WordCloud cannot render.
if text_data.strip():
    wordcloud = WordCloud(
        font_path=str(FONT_PATH),
        background_color=None,
        mode="RGBA",
        colormap="viridis",
        width=1200,
        height=600,
        max_words=200,
        max_font_size=150,
        min_font_size=10,
        relative_scaling=0.5,
        collocations=False,
    ).generate(text_data)

    fig_wc, ax = plt.subplots(figsize=(14, 7))
    try:
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")
        fig_wc.patch.set_alpha(0)
        st.pyplot(fig_wc, use_container_width=True)
    finally:
        # pyplot keeps figures alive until closed; without this each rerun
        # of the script would leak one figure.
        plt.close(fig_wc)
else:
    st.info("ν•΄λ‹Ή 라벨에 λŒ€ν•œ ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.")