Spaces:

MLBench
/

Embedding-Calculation

Running

App Files Files Community

Embedding-Calculation / utils.py

saim1309

Upload 4 files

42db8ec verified 1 day ago

raw

history blame contribute delete

6.49 kB

	import openai
	import numpy as np
	import re
	from typing import List, Tuple
	from config import EMBED_MODEL

	def get_embedding(text: str) -> List[float]:
	    """Return the embedding vector that ``EMBED_MODEL`` produces for ``text``.

	    Newlines are flattened to spaces and surrounding whitespace is trimmed
	    before the text is sent to the OpenAI embeddings endpoint. Only the
	    first (and only) item of the response is returned.
	    """
	    single_line = text.replace("\n", " ").strip()
	    result = openai.embeddings.create(model=EMBED_MODEL, input=[single_line])
	    first_item = result.data[0]
	    return first_item.embedding

	def cosine_similarity(a: List[float], b: List[float]) -> float:
	    """Return the cosine similarity of two vectors as a built-in ``float``.

	    Args:
	        a: First vector.
	        b: Second vector; must have the same length as ``a``.

	    Returns:
	        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
	        norm (this also covers empty vectors).
	    """
	    vec_a = np.asarray(a, dtype=float)
	    vec_b = np.asarray(b, dtype=float)

	    # Each norm is needed for both the zero check and the division, so
	    # compute it once.
	    norm_a = np.linalg.norm(vec_a)
	    norm_b = np.linalg.norm(vec_b)
	    if norm_a == 0 or norm_b == 0:
	        return 0.0

	    # Cast so both return paths yield a plain Python float instead of a
	    # NumPy scalar on one path and a built-in float on the other.
	    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))

	def clean_time(time_str: str) -> str:
	    """Normalize a clock time such as ``"7pm"`` to ``"7:00 PM"``.

	    Args:
	        time_str: Free-form text that may contain a 12-hour clock time.

	    Returns:
	        ``"<hour>:<minute> <AM|PM>"`` for the first time found, with the
	        minute defaulting to ``"00"`` when none is given. If no time is
	        found, the input is returned with surrounding whitespace stripped.
	        Falsy input (``""`` or ``None``) yields ``""``.
	    """
	    if not time_str:
	        return ""

	    # The meridiem group needs a real alternation. An escaped pipe (`\|`)
	    # matches a literal "|" character, which would make this pattern miss
	    # every ordinary "AM"/"PM" suffix.
	    time_match = re.search(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', time_str, re.IGNORECASE)
	    if time_match is None:
	        return time_str.strip()

	    hour, minute, meridiem = time_match.groups()
	    return f"{hour}:{minute or '00'} {meridiem.upper()}"

	def find_top_k_matches(user_embedding, dataset, k=3):
	    """Return the ``k`` dataset entries most similar to ``user_embedding``.

	    Args:
	        user_embedding: Query vector, compared against each entry with
	            ``cosine_similarity``.
	        dataset: Iterable of ``(entry_id, text, embedding)`` triples.
	        k: Maximum number of matches to return.

	    Returns:
	        A list of ``(score, entry_id, text)`` tuples ordered from highest
	        to lowest score. Entries with equal scores keep their dataset order.
	    """
	    scored = [
	        (cosine_similarity(user_embedding, emb), entry_id, text)
	        for entry_id, text, emb in dataset
	    ]
	    # Rank on the score alone. Sorting the bare tuples would fall through to
	    # comparing entry_id/text whenever two scores tie, which raises
	    # TypeError if those values are not mutually comparable.
	    scored.sort(key=lambda item: item[0], reverse=True)
	    return scored[:k]

	def classify_intent(question: str) -> str:
	    """Label a user question as "Mode A" or "Mode B".

	    Mode A is Recommendation Mode (workshops, dates, availability,
	    recommendations); Mode B is Front Desk Mode and is the default for
	    everything else.

	    The question is sent to the chat model with a fixed classification
	    prompt. "Mode A" is returned only when that text appears in the reply;
	    any other reply, and any exception raised while calling the model or
	    reading its response, results in "Mode B".
	    """
	    prompt = f"""Classify the following user question into one of two modes:
	1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
	2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.

	User Question: "{question}"

	Response must be exactly "Mode A" or "Mode B"."""

	    request = {
	        "model": "gpt-4o-mini",
	        "messages": [{"role": "user", "content": prompt}],
	        "temperature": 0,
	        "max_tokens": 5,
	    }
	    try:
	        completion = openai.chat.completions.create(**request)
	        answer = completion.choices[0].message.content.strip()
	    except Exception as e:
	        # Best-effort classification: fall back to Front Desk Mode.
	        print(f"Error in intent classification: {e}")
	        return "Mode B"

	    return "Mode A" if "Mode A" in answer else "Mode B"

	def should_include_email(question: str) -> bool:
	    """Report whether the contact email should be shown for this question.

	    The email is allowed for payments, refunds, attendance issues and
	    account problems. In practice this means the lower-cased question must
	    contain at least one entry of ``config.EMAIL_ONLY_KEYWORDS`` as a whole
	    word (delimited by ``\\b`` word boundaries).
	    """
	    from config import EMAIL_ONLY_KEYWORDS

	    lowered = question.lower()
	    return any(
	        re.search(rf'\b{re.escape(keyword)}\b', lowered)
	        for keyword in EMAIL_ONLY_KEYWORDS
	    )

	def classify_user_type(question: str, history: List[dict] = None) -> str:
	    """Classify the user as one of five fixed categories.

	    The categories are ``new_actor``, ``experienced_actor``, ``parent``,
	    ``current_student`` and ``unknown``.

	    Args:
	        question: The user's latest message.
	        history: Optional list of message dicts with ``role`` and
	            ``content`` keys. The last three are appended to the prompt,
	            each truncated to its first 100 characters.

	    Returns:
	        The first category name (in the order listed above) found in the
	        model's lower-cased reply, or ``"unknown"`` when none is found or
	        the model call raises.
	    """
	    history_str = ""
	    if history:
	        recent_lines = [f"{m['role']}: {m['content'][:100]}..." for m in history[-3:]]
	        history_str = "\nConversation context:\n" + "\n".join(recent_lines)

	    prompt = f"""Classify the user into exactly one of these categories based on their question and context:
	1. "new_actor": Just starting out, has no experience, or is asking how to begin.
	2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
	3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
	4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
	5. "unknown": Not enough information yet.

	User Question: "{question}"{history_str}

	Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""

	    valid_types = ("new_actor", "experienced_actor", "parent", "current_student", "unknown")
	    try:
	        completion = openai.chat.completions.create(
	            model="gpt-4o-mini",
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0,
	            max_tokens=10,
	        )
	        answer = completion.choices[0].message.content.strip().lower()
	    except Exception as e:
	        # Best-effort classification: an API failure is reported, not raised.
	        print(f"Error in user type classification: {e}")
	        return "unknown"

	    return next((label for label in valid_types if label in answer), "unknown")

	def recalculate_all_embeddings():
	    """Fill in embeddings for FAQ and podcast rows whose ``embedding`` is NULL.

	    ``faq_entries`` rows are embedded from their ``question`` column and
	    ``podcast_episodes`` rows from ``full_text``. Each embedding is stored
	    as a JSON string. A failure on one row is printed and skipped so the
	    remaining rows are still processed; a single commit is issued after
	    both tables have been handled.
	    """
	    from database import get_db_connection
	    import json

	    # (label used in progress messages, select statement, update statement),
	    # processed in this order.
	    jobs = (
	        (
	            "FAQ",
	            "SELECT id, question FROM faq_entries WHERE embedding IS NULL",
	            "UPDATE faq_entries SET embedding = ? WHERE id = ?",
	        ),
	        (
	            "Podcast",
	            "SELECT id, full_text FROM podcast_episodes WHERE embedding IS NULL",
	            "UPDATE podcast_episodes SET embedding = ? WHERE id = ?",
	        ),
	    )

	    with get_db_connection() as conn:
	        cur = conn.cursor()

	        for label, select_sql, update_sql in jobs:
	            print(f"Starting {label} embedding recalculation...")
	            cur.execute(select_sql)
	            for row_id, source_text in cur.fetchall():
	                try:
	                    vector = get_embedding(source_text)
	                    cur.execute(update_sql, (json.dumps(vector), row_id))
	                    print(f" ✓ Updated {label} ID {row_id}")
	                except Exception as e:
	                    print(f" ✗ Error updating {label} ID {row_id}: {e}")

	        conn.commit()
	        print("Embedding recalculation complete.")