Spaces:
Running
Running
| import os | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| class EmbeddingHandler: | |
| def __init__(self): | |
| print("TF-IDF Search ready — free, no download!") | |
| self.chunks_store = {} | |
| self.vectorizers = {} | |
| self.matrices = {} | |
| def process_and_store(self, transcript: str, session_id: str): | |
| chunks = self._chunk_transcript(transcript) | |
| vectorizer = TfidfVectorizer(stop_words='english') | |
| matrix = vectorizer.fit_transform(chunks) | |
| self.chunks_store[session_id] = chunks | |
| self.vectorizers[session_id] = vectorizer | |
| self.matrices[session_id] = matrix | |
| print(f"Stored {len(chunks)} chunks for session {session_id}") | |
| def retrieve(self, query: str, session_id: str, top_k: int = 5) -> str: | |
| if session_id not in self.chunks_store: | |
| raise ValueError("Session not found.") | |
| vectorizer = self.vectorizers[session_id] | |
| matrix = self.matrices[session_id] | |
| chunks = self.chunks_store[session_id] | |
| query_vec = vectorizer.transform([query]) | |
| scores = cosine_similarity(query_vec, matrix).flatten() | |
| top_indices = scores.argsort()[-top_k:][::-1] | |
| top_chunks = [chunks[i] for i in top_indices] | |
| return "\n\n---\n\n".join(top_chunks) | |
| def _chunk_transcript(self, transcript: str, chunk_size: int = 400, overlap: int = 50) -> list: | |
| words = transcript.split() | |
| chunks = [] | |
| for i in range(0, len(words), chunk_size - overlap): | |
| chunk = " ".join(words[i:i + chunk_size]) | |
| if chunk: | |
| chunks.append(chunk) | |
| return chunks | |
| def cleanup_session(self, session_id: str): | |
| for store in [self.chunks_store, self.vectorizers, self.matrices]: | |
| if session_id in store: | |
| del store[session_id] |