| | import streamlit as st |
| | from datasets import load_dataset |
| | import os |
| |
|
# Optional Hugging Face access token taken from the environment; None when
# the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Page-level configuration and the visible title of the app.
st.set_page_config(layout="wide", page_title="Web Clusters inspection")
st.title("Web clusters inspection")

# Introductory text describing how the clusters and their scores were produced.
st.markdown(
    """
We clustered 100k web samples using [text-clustering](https://github.com/huggingface/text-clustering).

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. \
Technically, we provide it with 10 random examples from the cluster in the prompt and ask it to judge their topics.

Additionally, the model was tasked with finding the topic of each cluster (based on the 10 random examples).
"""
)
| |
|
| |
|
@st.cache_data
def load_data(min_score=1, max_score=10, show_special=False):
    """Load the cluster dataset and keep only the rows matching the filters.

    Args:
        min_score: Lowest educational score (inclusive) to keep.
        max_score: Highest educational score (inclusive) to keep.
        show_special: When true, keep only rows whose ``educational_score``
            cannot be converted to an integer; the score range is then
            ignored.

    Returns:
        The filtered ``train`` split of the dataset.
    """
    dataset = load_dataset(
        "HuggingFaceTB/FW_clusters_100k_145_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    def _keep_row(row):
        try:
            score = int(row['educational_score'])
            if show_special:
                # A valid integer score is never "special".
                return False
            return min_score <= score <= max_score
        except (ValueError, TypeError):
            # The score is missing or not an integer: such rows are shown
            # only in the "special" view.
            return show_special

    return dataset.filter(_keep_row)
| |
|
# --- Filter controls -------------------------------------------------------
st.subheader("Cluster information")
col_1, col_2, col_3 = st.columns(3)
with col_1:
    show_special = st.checkbox('Show only clusters with undefined educational score', False)
with col_2:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_3:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

# --- Topic selection -------------------------------------------------------
ds = load_data(min_value, max_value, show_special)
# Sort the distinct topics so the dropdown order is stable instead of
# following arbitrary set iteration order. key=str keeps the sort from
# failing should the column mix strings with None.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# --- Cluster / example browser ---------------------------------------------
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    # Fetch the chosen row once and reuse it for both the examples and the score.
    cluster = selected_cluster[index_cluster]
    files = cluster["examples"]

    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )

        sample = files[index_example]
        st.markdown(f"**Educational score of the cluster**: {cluster['educational_score']}")
        st.markdown(sample)
    else:
        # A cluster without examples would make the number input invalid
        # (max_value below min_value), so report it instead of crashing.
        st.markdown(f"**Educational score of the cluster**: {cluster['educational_score']}")
        st.markdown("No files found, change the cluster.")
else:
    st.markdown("No files found, change the cluster.")