import os

import gradio as gr
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Lazily-created, module-level embedder. Loading SentenceTransformer weights
# is expensive, so do it once per process instead of on every button click.
_EMBEDDER = None


def _get_embedder():
    """Return the shared SentenceTransformer, creating it on first use."""
    global _EMBEDDER
    if _EMBEDDER is None:
        _EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2")
    return _EMBEDDER


def _read_text(file):
    """Extract UTF-8 text from a Gradio file input.

    Accepts raw bytes, an open file-like object, or a filesystem path
    (Gradio's NamedString is a str subclass holding the temp-file path).
    Returns a ``(text, error_message)`` pair; at most one is non-empty.
    """
    if hasattr(file, "decode"):  # raw bytes payload
        try:
            return file.decode("utf-8"), ""
        except Exception as e:
            return "", f"Error decoding NamedString: {e}"
    if hasattr(file, "read"):  # open file object
        try:
            return file.read().decode("utf-8"), ""
        except Exception as e:
            return "", f"Error reading/decoding file object: {e}"
    if isinstance(file, str) and os.path.exists(file):  # temp-file path
        try:
            with open(file, "r", encoding="utf-8") as f:
                return f.read(), ""
        except Exception as e:
            return "", f"Error reading file from path: {e}"
    # Unknown input shape: no text, no specific error — caller reports it.
    return "", ""


def _format_topic_weights(topic_model):
    """Render per-topic word/weight tables as a plain-text report."""
    lines = [
        "=" * 80,
        "TOPIC WEIGHTS (Word Importance Scores)",
        "=" * 80,
        "",
    ]
    # Topic -1 is BERTopic's outlier bucket; skip it in the report.
    for topic_id in (t for t in topic_model.get_topics() if t != -1):
        lines.append(f"TOPIC {topic_id}")
        lines.append("-" * 40)
        topic_words = topic_model.get_topic(topic_id)
        if topic_words:
            for word, weight in topic_words[:10]:  # top 10 words
                lines.append(f"  {word:20s} {weight:8.4f}")
        lines.append("")
    return "\n".join(lines) + "\n"


def run_from_textfile(file):
    """Run BERTopic over an uploaded .txt file (one document per line).

    Returns a 4-tuple matching the Gradio outputs:
    (topic overview, topic-weight report, doc->topic assignments, figure).
    On any failure the first slot carries the error message and the
    figure slot is None.
    """
    if file is None:
        return "Please upload a .txt file.", "", "", None

    text, error = _read_text(file)
    if error:
        return error, "", "", None
    if not text:
        return "Could not read the file content. Please check the file type and content.", "", "", None

    # Split the text into documents (one per non-blank line).
    docs = [line.strip() for line in text.splitlines() if line.strip()]
    if len(docs) < 3:
        return "Need at least 3 documents (one per line).", "", "", None

    # ---- Topic Modeling (probabilities are not used downstream) ----
    topic_model = BERTopic(embedding_model=_get_embedder())
    topics, _ = topic_model.fit_transform(docs)

    # ---- Topic Summary ----
    topic_info = topic_model.get_topic_info().to_string(index=False)

    # ---- Topic weights report ----
    weights_output = _format_topic_weights(topic_model)

    # ---- Document -> Topic Assignments ----
    assignments = "\n".join(
        f"Doc {i + 1}: Topic {topic}" for i, topic in enumerate(topics)
    )

    # visualize_barchart raises when every document lands in the outlier
    # topic (-1); degrade to "no figure" instead of crashing the UI.
    try:
        fig = topic_model.visualize_barchart(top_n_topics=10)
    except Exception:
        fig = None

    return topic_info, weights_output, assignments, fig


# ---- Gradio Interface ----
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Topic Modeling from TXT File (BERTopic)")
    gr.Markdown(
        "Upload a plain text (.txt) file. Each line should contain **one LLM response**.\n"
        "\nExample format:\n```\nResponse 1...\nResponse 2...\nResponse 3...\n```"
    )
    file_input = gr.File(label="Upload .txt file")
    run_button = gr.Button("Run Topic Modeling")
    topic_output = gr.Textbox(label="Topic Overview", lines=12)
    weights_output = gr.Textbox(label="📊 Topic Weights (Word Importance)", lines=20)
    assignment_output = gr.Textbox(label="Document → Topic Assignments", lines=12)
    fig_output = gr.Plot(label="Topic Visualization")

    run_button.click(
        fn=run_from_textfile,
        inputs=file_input,
        outputs=[topic_output, weights_output, assignment_output, fig_output],
    )

# Launch app
demo.launch()