# saim1309's picture
# Upload 4 files
# 42db8ec verified
import openai
import numpy as np
import re
from typing import List, Tuple
from config import EMBED_MODEL
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text``.

    Newlines are replaced with single spaces and surrounding whitespace is
    trimmed before the text is sent to the embeddings endpoint using the
    model named by ``EMBED_MODEL``.
    """
    single_line = " ".join(text.split("\n")).strip()
    result = openai.embeddings.create(input=[single_line], model=EMBED_MODEL)
    first_item = result.data[0]
    return first_item.embedding
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, of the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a builtin
        ``float``, or ``0.0`` when either vector has zero magnitude (the
        similarity is undefined there, so 0.0 is used as a neutral score).
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)
    # Compute each norm once and reuse it for both the zero check and the
    # final division.
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # Convert the NumPy scalar so callers get the annotated builtin float.
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
def clean_time(time_str: str) -> str:
    """Normalize the first 12-hour time found in ``time_str``.

    Args:
        time_str: Free-form text that may contain a time such as
            ``"10:30 am"``, ``"7pm"`` or ``"930AM"``.

    Returns:
        ``"<hour>:<minute> <AM|PM>"`` for the first match, with the minute
        defaulting to ``"00"`` when absent; the stripped input when no
        AM/PM time is found; ``""`` for empty or ``None`` input.
    """
    if not time_str:
        return ""
    # Minutes are either introduced by a colon (0-2 digits, as before) or,
    # without a colon, must be exactly two digits. The two-digit rule keeps
    # compact forms like "930AM" from being split as hour "93" / minute "0".
    time_match = re.search(
        r'(?P<hour>\d{1,2})'
        r'(?::(?P<colon_minute>\d{0,2})|(?P<bare_minute>\d{2}))?'
        r'\s*(?P<ampm>AM|PM)',
        time_str,
        re.IGNORECASE,
    )
    if time_match is None:
        return time_str.strip()
    hour = time_match.group("hour")
    minute = (
        time_match.group("colon_minute")
        or time_match.group("bare_minute")
        or "00"
    )
    ampm = time_match.group("ampm").upper()
    return f"{hour}:{minute} {ampm}"
def find_top_k_matches(user_embedding, dataset, k=3):
    """Find top k matching entries from a dataset.

    Args:
        user_embedding: Embedding vector of the user's query.
        dataset: Iterable of ``(entry_id, text, embedding)`` triples.
        k: Maximum number of matches to return.

    Returns:
        A list of ``(score, entry_id, text)`` tuples ordered by descending
        cosine similarity, truncated to ``k`` items. Entries with equal
        scores keep their relative order from ``dataset``.
    """
    scored = [
        (cosine_similarity(user_embedding, emb), entry_id, text)
        for entry_id, text, emb in dataset
    ]
    # Sort on the score alone: comparing whole tuples would fall through to
    # entry_id/text on tied scores, which raises TypeError when those values
    # are not mutually orderable (e.g. None or mixed-type ids).
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:k]
def classify_intent(question: str) -> str:
    """
    Route a user question to one of two response modes via a chat completion.

    Returns "Mode A" (Recommendation Mode) when the model's reply contains
    "Mode A"; otherwise returns "Mode B" (Front Desk Mode). Any exception
    raised while calling the model is printed and also yields "Mode B".
    """
    prompt = f"""Classify the following user question into one of two modes:
1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.
User Question: "{question}"
Response must be exactly "Mode A" or "Mode B"."""
    default_mode = "Mode B"  # Front Desk Mode is the fallback
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        completion = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=chat_messages,
            temperature=0,
            max_tokens=5,
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error in intent classification: {e}")
        return default_mode
    return "Mode A" if "Mode A" in answer else default_mode
def should_include_email(question: str) -> bool:
    """
    Decide whether the contact email should be shown for this question.

    Returns True when any entry of ``EMAIL_ONLY_KEYWORDS`` occurs in the
    lower-cased question as a whole word or phrase (bounded by ``\\b``),
    and False otherwise.
    """
    from config import EMAIL_ONLY_KEYWORDS

    lowered_question = question.lower()
    return any(
        re.search(rf'\b{re.escape(keyword)}\b', lowered_question)
        for keyword in EMAIL_ONLY_KEYWORDS
    )
def classify_user_type(question: str, history: List[dict] = None) -> str:
    """
    Label the user as one of: new_actor, experienced_actor, parent,
    current_student, unknown.

    The question, plus up to the last three history messages (each content
    cut to 100 characters), is sent to a chat model. The first valid label
    found in the lower-cased reply is returned. An unrecognised reply, or
    any exception raised while calling the model (which is printed),
    yields "unknown".
    """
    history_str = ""
    if history:
        recent_lines = [
            f"{m['role']}: {m['content'][:100]}..." for m in history[-3:]
        ]
        history_str = "\nConversation context:\n" + "\n".join(recent_lines)
    prompt = f"""Classify the user into exactly one of these categories based on their question and context:
1. "new_actor": Just starting out, has no experience, or is asking how to begin.
2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
5. "unknown": Not enough information yet.
User Question: "{question}"{history_str}
Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""
    try:
        completion = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=10,
        )
        reply = completion.choices[0].message.content.strip().lower()
    except Exception as e:
        print(f"Error in user type classification: {e}")
        return "unknown"
    # Labels are checked in this fixed order; the first one found wins.
    known_labels = (
        "new_actor",
        "experienced_actor",
        "parent",
        "current_student",
        "unknown",
    )
    return next((label for label in known_labels if label in reply), "unknown")
def recalculate_all_embeddings():
    """Fill in missing embeddings for FAQ entries and podcast episodes.

    For every row of ``faq_entries`` (using ``question``) and then
    ``podcast_episodes`` (using ``full_text``) whose ``embedding`` is NULL,
    an embedding is generated and stored as a JSON string. A failure on one
    row is printed and does not stop the remaining rows. Changes are
    committed once, after both tables have been processed.
    """
    from database import get_db_connection
    import json

    # Each pass: (label used in progress messages, query selecting rows that
    # lack an embedding, statement storing the new embedding).
    passes = (
        (
            "FAQ",
            "SELECT id, question FROM faq_entries WHERE embedding IS NULL",
            "UPDATE faq_entries SET embedding = ? WHERE id = ?",
        ),
        (
            "Podcast",
            "SELECT id, full_text FROM podcast_episodes WHERE embedding IS NULL",
            "UPDATE podcast_episodes SET embedding = ? WHERE id = ?",
        ),
    )
    with get_db_connection() as conn:
        cur = conn.cursor()
        for label, select_sql, update_sql in passes:
            print(f"Starting {label} embedding recalculation...")
            cur.execute(select_sql)
            pending_rows = cur.fetchall()
            for row_id, source_text in pending_rows:
                try:
                    vector = get_embedding(source_text)
                    cur.execute(update_sql, (json.dumps(vector), row_id))
                    print(f" ✓ Updated {label} ID {row_id}")
                except Exception as e:
                    print(f" ✗ Error updating {label} ID {row_id}: {e}")
        conn.commit()
        print("Embedding recalculation complete.")