import openai
import numpy as np
import re
from typing import List, Tuple
from config import EMBED_MODEL
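
# Shared helpers for the assistant: OpenAI embedding generation, cosine-similarity
# matching over embedded entries, LLM-based intent and user-type classification,
# and a maintenance routine that backfills missing embeddings in the database.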

def get_embedding(text: str) -> List[float]:
    """Generate embedding for a given text."""
    text_strip = text.replace("\n", " ").strip()
    response = openai.embeddings.create(input=[text_strip], model=EMBED_MODEL)
    return response.data[0].embedding

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    a = np.array(a)
    b = np.array(b)
    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 0.0
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def clean_time(time_str: str) -> str:
    """Clean up time string."""
    if not time_str:
        return ""
    
    time_match = re.search(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', time_str, re.IGNORECASE)
    if time_match:
        hour = time_match.group(1)
        minute = (time_match.group(2) or "00").zfill(2)  # pad single-digit minutes to two digits
        ampm = time_match.group(3).upper()
        return f"{hour}:{minute} {ampm}"
    
    return time_str.strip()

def find_top_k_matches(user_embedding: List[float], dataset: List[Tuple], k: int = 3) -> List[Tuple]:
    """Find top k matching entries from a dataset."""
    scored = []
    for entry_id, text, emb in dataset:
        score = cosine_similarity(user_embedding, emb)
        scored.append((score, entry_id, text))
    scored.sort(key=lambda item: item[0], reverse=True)  # rank by similarity score, highest first
    return scored[:k]

def classify_intent(question: str) -> str:
    """
    Classify the user's intent into:
    Mode A: Recommendation Mode (Workshops, Dates, Availability, Recommendations)
    Mode B: Front Desk Mode (Default - Everything else)
    """
    prompt = f"""Classify the following user question into one of two modes:
1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.

User Question: "{question}"

Response must be exactly "Mode A" or "Mode B"."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=5
        )
        prediction = response.choices[0].message.content.strip()
        if "Mode A" in prediction:
            return "Mode A"
        return "Mode B"
    except Exception as e:
        print(f"Error in intent classification: {e}")
        return "Mode B"  # Default to Front Desk Mode

def should_include_email(question: str) -> bool:
    """
    Determine if the contact email should be shown based on user intent.
    Allowed for: Payments, Refunds, Attendance issues, Account problems.
    """
    from config import EMAIL_ONLY_KEYWORDS
    
    question_lower = question.lower()
    for word in EMAIL_ONLY_KEYWORDS:
        pattern = rf'\b{re.escape(word)}\b'
        if re.search(pattern, question_lower):
            return True
            
    return False

def classify_user_type(question: str, history: List[dict] = None) -> str:
    """
    Classify the user type into:
    - new_actor
    - experienced_actor
    - parent
    - current_student
    - unknown
    """
    history_str = ""
    if history:
        history_str = "\nConversation context:\n" + "\n".join([f"{m['role']}: {m['content'][:100]}..." for m in history[-3:]])

    prompt = f"""Classify the user into exactly one of these categories based on their question and context:
1. "new_actor": Just starting out, has no experience, or is asking how to begin.
2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
5. "unknown": Not enough information yet.

User Question: "{question}"{history_str}

Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=10
        )
        prediction = response.choices[0].message.content.strip().lower()
        valid_types = ["new_actor", "experienced_actor", "parent", "current_student", "unknown"]
        for t in valid_types:
            if t in prediction:
                return t
        return "unknown"
    except Exception as e:
        print(f"Error in user type classification: {e}")
        return "unknown"

def recalculate_all_embeddings():
    """Recalculate embeddings for all entries in faq_entries and podcast_episodes that are missing embeddings."""
    from database import get_db_connection
    import json
    
    with get_db_connection() as conn:
        cur = conn.cursor()
        
        # 1. Update FAQs
        print("Starting FAQ embedding recalculation...")
        cur.execute("SELECT id, question FROM faq_entries WHERE embedding IS NULL")
        faqs = cur.fetchall()
        for faq_id, question in faqs:
            try:
                emb = get_embedding(question)
                cur.execute("UPDATE faq_entries SET embedding = ? WHERE id = ?", (json.dumps(emb), faq_id))
                print(f"  ✓ Updated FAQ ID {faq_id}")
            except Exception as e:
                print(f"  ✗ Error updating FAQ ID {faq_id}: {e}")
        
        # 2. Update Podcasts
        print("Starting Podcast embedding recalculation...")
        cur.execute("SELECT id, full_text FROM podcast_episodes WHERE embedding IS NULL")
        podcasts = cur.fetchall()
        for pod_id, full_text in podcasts:
            try:
                emb = get_embedding(full_text)
                cur.execute("UPDATE podcast_episodes SET embedding = ? WHERE id = ?", (json.dumps(emb), pod_id))
                print(f"  ✓ Updated Podcast ID {pod_id}")
            except Exception as e:
                print(f"  ✗ Error updating Podcast ID {pod_id}: {e}")
        
        conn.commit()
    print("Embedding recalculation complete.")