# streamlit-sample-space / src /streamlit_app.py
# aug6th's picture
# Update src/streamlit_app.py
# 1c1190b verified
import streamlit as st
import pathlib
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
# -----------------------------
# Page Config
# -----------------------------
# Configure the page title and a wide layout, then render the on-page heading.
st.set_page_config(page_title="K-MHaS Korean Hate Speech Dashboard", layout="wide")
st.title("๐Ÿ‡ฐ๐Ÿ‡ท K-MHaS Korean Hate Speech Analytics Dashboard")
# -----------------------------
# Label Mapping
# -----------------------------
# Display names ordered by integer label id: position in the tuple == id.
_LABEL_NAMES = (
    "Origin",
    "Physical",
    "Politics",
    "Profanity",
    "Age",
    "Gender",
    "Race",
    "Religion",
    "Not Hate",
)
# Integer label id -> display name.
LABEL_MAP = dict(enumerate(_LABEL_NAMES))
# -----------------------------
# Load Dataset (Parquet Revision)
# -----------------------------
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the K-MHaS training split as a pandas DataFrame.

    The data is read from the ``refs/convert/parquet`` revision of the
    ``jeanlee/kmhas_korean_hate_speech`` dataset. The function is wrapped
    in ``st.cache_data``.

    Returns:
        A DataFrame holding the rows of the ``train`` split.
    """
    ds = load_dataset(
        "jeanlee/kmhas_korean_hate_speech",
        split="train",
        revision="refs/convert/parquet",
    )
    # Use the dataset's own pandas conversion instead of pd.DataFrame(ds),
    # which has to iterate the dataset row by row in Python.
    return ds.to_pandas()
df = load_data()


# -----------------------------
# Preprocessing
# -----------------------------
def _to_label_names(label_ids):
    """Translate one row's label ids into display names via LABEL_MAP."""
    return [LABEL_MAP[label_id] for label_id in label_ids]


# Length of every text, as reported by len().
df["length"] = df["text"].map(len)

# Convert the numeric labels into their string names.
df["label_name"] = df["label"].apply(_to_label_names)

# One row per (sample, label name) pair; used for per-label statistics.
df_exploded = df.explode(column="label_name")
# -----------------------------
# KPI Section
# -----------------------------
col1, col2, col3, col4 = st.columns(4)

# Headline numbers shown in the KPI row.
total_samples = len(df)
avg_length = df["length"].mean()
label_counts = df_exploded["label_name"].value_counts()
top_label = label_counts.idxmax()
not_hate_count = label_counts.get("Not Hate", 0)
# Share of samples that are not counted under "Not Hate".
hate_ratio = 1 - (not_hate_count / total_samples)

# (column, caption, formatted value) triples, rendered left to right.
kpi_cards = (
    (col1, "์ด ์ƒ˜ํ”Œ ์ˆ˜", f"{total_samples:,}"),
    (col2, "ํ˜์˜ค ๋น„์œจ", f"{hate_ratio:.2%}"),
    (col3, "ํ‰๊ท  ํ…์ŠคํŠธ ๊ธธ์ด", f"{avg_length:.1f}"),
    (col4, "์ตœ๋‹ค ๋ผ๋ฒจ", top_label),
)
for kpi_column, kpi_caption, kpi_value in kpi_cards:
    kpi_column.metric(kpi_caption, kpi_value)

st.markdown("---")
# -----------------------------
# Charts Section
# -----------------------------
left, right = st.columns(2)

# Left column: bar chart of how often each label name occurs.
fig1 = px.bar(
    label_counts,
    x=label_counts.index,
    y=label_counts.values,
    title="๋ผ๋ฒจ ๋ถ„ํฌ",
)
left.plotly_chart(fig1, use_container_width=True)

# Right column: histogram of the per-sample text lengths.
fig2 = px.histogram(df, x="length", nbins=50, title="ํ…์ŠคํŠธ ๊ธธ์ด ๋ถ„ํฌ")
right.plotly_chart(fig2, use_container_width=True)

st.markdown("---")
# -----------------------------
# Label Filter Section
# -----------------------------
st.subheader("๐Ÿ”Ž ๋ผ๋ฒจ ํ•„ํ„ฐ")

# Offer every label name present in the data, in sorted order.
label_choices = sorted(df_exploded["label_name"].unique())
selected_label = st.selectbox("๋ผ๋ฒจ ์„ ํƒ", label_choices)

# Keep the rows whose list of label names contains the chosen label.
has_selected_label = df["label_name"].apply(
    lambda names: selected_label in names
)
filtered_df = df[has_selected_label]

st.write(f"์„ ํƒ๋œ ๋ผ๋ฒจ ์ƒ˜ํ”Œ ์ˆ˜: {len(filtered_df):,}")

# Show at most the first 100 matching rows.
preview_rows = filtered_df[["text", "label_name"]].head(100)
st.dataframe(preview_rows, use_container_width=True)

st.markdown("---")
# -----------------------------
# WordCloud Section
# -----------------------------
# The font file is looked up next to this script.
BASE_DIR = pathlib.Path(__file__).resolve().parent
FONT_PATH = BASE_DIR / "NanumGothic.ttf"

# Upper bound on the number of rows fed into the word cloud.
MAX_WORDCLOUD_ROWS = 3000

st.subheader("โ˜๏ธ Word Cloud")

# With too many rows, sample first (keeps rendering time stable). Sampling
# before joining avoids building the full corpus string only to discard it.
if len(filtered_df) > MAX_WORDCLOUD_ROWS:
    sample_df = filtered_df.sample(MAX_WORDCLOUD_ROWS, random_state=42)
else:
    sample_df = filtered_df
text_data = " ".join(sample_df["text"].tolist())

# Whitespace-only text contains no words to draw, so treat it like empty text.
if text_data.strip():
    wordcloud = WordCloud(
        font_path=str(FONT_PATH),
        background_color=None,
        mode="RGBA",
        colormap="viridis",
        width=1200,
        height=600,
        max_words=200,
        max_font_size=150,
        min_font_size=10,
        relative_scaling=0.5,
        collocations=False,
    ).generate(text_data)

    fig_wc, ax = plt.subplots(figsize=(14, 7))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    # Transparent figure background to go with the RGBA word cloud.
    fig_wc.patch.set_alpha(0)
    st.pyplot(fig_wc, use_container_width=True)
    # Figures created through pyplot stay registered until closed; without
    # this, every rerun of the script would leave another figure in memory.
    plt.close(fig_wc)
else:
    st.info("ํ•ด๋‹น ๋ผ๋ฒจ์— ๋Œ€ํ•œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")