Spaces:

aug6th
/

streamlit-sample-space

Sleeping

App Files Files Community

streamlit-sample-space / src /streamlit_app.py

aug6th

Update src/streamlit_app.py

1c1190b verified 9 days ago

raw

history blame contribute delete

4.02 kB

	import streamlit as st
	import pathlib
	import pandas as pd
	import plotly.express as px
	from datasets import load_dataset
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	from collections import Counter

	# -----------------------------
	# Page Config
	# -----------------------------
	# Browser-tab title and wide layout for the whole dashboard.
	st.set_page_config(page_title="K-MHaS Korean Hate Speech Dashboard", layout="wide")

	# Main heading shown at the top of the page.
	st.title("🇰🇷 K-MHaS Korean Hate Speech Analytics Dashboard")

	# -----------------------------
	# Label Mapping
	# -----------------------------
	# Display name for each integer label id; the position in the tuple is the id.
	LABEL_MAP = dict(
	    enumerate(
	        (
	            "Origin",
	            "Physical",
	            "Politics",
	            "Profanity",
	            "Age",
	            "Gender",
	            "Race",
	            "Religion",
	            "Not Hate",
	        )
	    )
	)

	# -----------------------------
	# Load Dataset (Parquet Revision)
	# -----------------------------
	@st.cache_data
	def load_data():
	    """Return the K-MHaS training split as a pandas DataFrame.

	    The "train" split of ``jeanlee/kmhas_korean_hate_speech`` is requested
	    at the ``refs/convert/parquet`` revision and wrapped in a DataFrame.
	    The function is decorated with ``st.cache_data``.
	    """
	    return pd.DataFrame(
	        load_dataset(
	            "jeanlee/kmhas_korean_hate_speech",
	            split="train",
	            revision="refs/convert/parquet",
	        )
	    )

	df = load_data()

	# -----------------------------
	# Preprocessing
	# -----------------------------
	# Character count of every text.
	df["length"] = df["text"].map(len)


	def _to_label_names(label_ids):
	    """Translate one row's numeric label ids into their display names."""
	    return [LABEL_MAP[label_id] for label_id in label_ids]


	# Convert the numeric labels to strings.
	df["label_name"] = df["label"].map(_to_label_names)

	# One row per (sample, label) pair, so labels can be counted individually.
	df_exploded = df.explode("label_name")

	# -----------------------------
	# KPI Section
	# -----------------------------
	kpi_columns = st.columns(4)

	total_samples = len(df)
	avg_length = df["length"].mean()

	# Per-label counts come from the exploded frame (one row per label).
	label_counts = df_exploded["label_name"].value_counts()
	top_label = label_counts.idxmax()

	# Share of samples that are not counted under "Not Hate".
	not_hate_count = label_counts.get("Not Hate", 0)
	hate_ratio = 1 - (not_hate_count / total_samples)

	# (title, formatted value) for each metric, in left-to-right display order.
	kpi_entries = (
	    ("총 샘플 수", f"{total_samples:,}"),
	    ("혐오 비율", f"{hate_ratio:.2%}"),
	    ("평균 텍스트 길이", f"{avg_length:.1f}"),
	    ("최다 라벨", top_label),
	)
	for kpi_column, (kpi_title, kpi_value) in zip(kpi_columns, kpi_entries):
	    kpi_column.metric(kpi_title, kpi_value)

	st.markdown("---")

	# -----------------------------
	# Charts Section
	# -----------------------------
	left, right = st.columns(2)

	# Left column: bar chart of how often each label occurs.
	label_bar = px.bar(
	    label_counts,
	    x=label_counts.index,
	    y=label_counts.values,
	    title="라벨 분포",
	)
	left.plotly_chart(label_bar, use_container_width=True)

	# Right column: histogram of text lengths in 50 bins.
	length_hist = px.histogram(df, x="length", nbins=50, title="텍스트 길이 분포")
	right.plotly_chart(length_hist, use_container_width=True)

	st.markdown("---")

	# -----------------------------
	# Label Filter Section
	# -----------------------------
	st.subheader("🔎 라벨 필터")

	# Offer every label name that occurs in the data, in sorted order.
	label_options = sorted(df_exploded["label_name"].unique())
	selected_label = st.selectbox("라벨 선택", label_options)

	# Keep the samples whose label list contains the selected label.
	has_selected_label = df["label_name"].map(
	    lambda names: selected_label in names
	)
	filtered_df = df[has_selected_label]

	st.write(f"선택된 라벨 샘플 수: {len(filtered_df):,}")

	# Show only the first 100 matching rows.
	preview_rows = filtered_df[["text", "label_name"]].head(100)
	st.dataframe(preview_rows, use_container_width=True)

	st.markdown("---")

	# -----------------------------
	# WordCloud Section
	# -----------------------------

	# The font file is looked up in the same directory as this script.
	# NOTE(review): presumably required so Korean glyphs render in the cloud;
	# confirm NanumGothic.ttf is actually shipped next to the app.
	BASE_DIR = pathlib.Path(__file__).resolve().parent
	FONT_PATH = BASE_DIR / "NanumGothic.ttf"

	# Upper bound on the number of rows fed into the word cloud.
	MAX_WORDCLOUD_ROWS = 3000

	st.subheader("☁️ Word Cloud")

	# Sample BEFORE joining: when there are too many rows only a fixed-seed
	# sample is used (keeps rendering time stable), so the full text is never
	# concatenated just to be thrown away.
	if len(filtered_df) > MAX_WORDCLOUD_ROWS:
	    wordcloud_rows = filtered_df.sample(MAX_WORDCLOUD_ROWS, random_state=42)
	else:
	    wordcloud_rows = filtered_df
	text_data = " ".join(wordcloud_rows["text"].tolist())

	# strip(): whitespace-only text contains no words to draw, so treat it the
	# same as having no text at all.
	if text_data.strip():
	    wordcloud = WordCloud(
	        font_path=str(FONT_PATH),
	        background_color=None,  # with mode="RGBA" this gives a transparent image
	        mode="RGBA",
	        colormap="viridis",
	        width=1200,
	        height=600,
	        max_words=200,
	        max_font_size=150,
	        min_font_size=10,
	        relative_scaling=0.5,
	        collocations=False,
	    ).generate(text_data)

	    fig_wc, ax = plt.subplots(figsize=(14, 7))
	    try:
	        ax.imshow(wordcloud, interpolation="bilinear")
	        ax.axis("off")

	        # Transparent figure background to match the transparent cloud image.
	        fig_wc.patch.set_alpha(0)
	        st.pyplot(fig_wc, use_container_width=True)
	    finally:
	        # pyplot keeps figures made by plt.subplots() alive until they are
	        # closed; without this every script rerun would leak one figure.
	        plt.close(fig_wc)
	else:
	    st.info("해당 라벨에 대한 텍스트가 없습니다.")