import os
from pathlib import Path
import gradio as gr
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
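
# Configuration: model names, retrieval depth, and the context document path can be overridden via environment variables.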
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
TOP_K = int(os.getenv("TOP_K", "3"))
DOC_PATH = Path(os.getenv("DOC_PATH", "challenge_context.txt"))
SYSTEM_GUARDRAILS = (
    "You are Challenge Copilot. Answer ONLY using the provided context. "
    "If the answer is not in the context, say: 'I don’t know based on the current document.' "
    "Then ask the user to add the missing official details to challenge_context.txt."
)
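
# Module-level cache so the index is built once per process and reused across chat turns.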
_INDEX = None
_QUERY_ENGINE = None
def build_index():
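    """Build the LlamaIndex query engine over DOC_PATH on first use and cache it globally."""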
    global _INDEX, _QUERY_ENGINE
    # Return the cached engine if it has already been built.
    if _QUERY_ENGINE is not None:
        return _QUERY_ENGINE
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError(
            "OPENAI_API_KEY is missing. Add it in the Space Settings → Variables and secrets."
        )
    # Create a placeholder document if the context file is missing.
    if not DOC_PATH.exists():
        DOC_PATH.write_text(
            "Add the official Building AI Application Challenge content here.\n",
            encoding="utf-8",
        )
    # Configure the LLM, embedding model, and chunking used by LlamaIndex.
    Settings.llm = OpenAI(model=MODEL, temperature=0.2)
    Settings.embed_model = OpenAIEmbedding(model=EMBED_MODEL)
    Settings.chunk_size = 800
    Settings.chunk_overlap = 120
    # Load only the target .txt file from its directory, then index it.
    data_dir = str(DOC_PATH.parent)
    docs = SimpleDirectoryReader(
        input_dir=data_dir,
        required_exts=[".txt"],
        recursive=False,
    ).load_data()
    docs = [d for d in docs if d.metadata.get("file_name") == DOC_PATH.name]
    if not docs:
        raise FileNotFoundError(f"Could not load {DOC_PATH.name}. Make sure it exists in the repo.")
    _INDEX = VectorStoreIndex.from_documents(docs)
    _QUERY_ENGINE = _INDEX.as_query_engine(similarity_top_k=TOP_K)
    return _QUERY_ENGINE
def format_sources(resp, max_sources=3, max_chars=220):
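    """Return a numbered list of retrieved sources (file name, score, snippet) for display under the answer."""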
    lines = []
    for i, sn in enumerate(getattr(resp, "source_nodes", [])[:max_sources], start=1):
        fn = sn.node.metadata.get("file_name", "unknown")
        snippet = sn.node.get_content().replace("\n", " ").strip()[:max_chars]
        score = getattr(sn, "score", None)
        score_txt = f" (score={score:.3f})" if isinstance(score, (float, int)) else ""
        lines.append(f"{i}. {fn}{score_txt}: {snippet}...")
    return "\n".join(lines) if lines else "No sources returned."
def chat(message, history):
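    """Gradio chat callback: build or reuse the index, query it with the guardrailed prompt, and optionally append sources."""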
    qe = build_index()
    prompt = (
        f"{SYSTEM_GUARDRAILS}\n\n"
        f"User question: {message}\n"
        f"Answer using ONLY the context."
    )
    resp = qe.query(prompt)
    answer = str(resp).strip()
    show_sources = os.getenv("SHOW_SOURCES", "true").lower() == "true"
    if show_sources:
        answer += "\n\n---\nSources:\n" + format_sources(resp, max_sources=TOP_K)
    return answer
# ---- UI ----
try:
    theme_obj = gr.themes.Soft()
except Exception:
    theme_obj = None  # compatibility fallback
with gr.Blocks(theme=theme_obj) as demo:
    gr.Markdown("# Challenge Copilot — RAG Q&A Bot")
    gr.Markdown(
        "Ask questions about the Building AI Application Challenge using "
        "challenge_context.txt (LlamaIndex + OpenAI)."
    )
    gr.ChatInterface(
        fn=chat,
        examples=[
            "What will I build in this live session?",
            "Who is this best for?",
            "What are the prerequisites?",
        ],
    )
if __name__ == "__main__":
    demo.launch()
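
# Note: this assumes gradio, llama-index, llama-index-llms-openai, and
# llama-index-embeddings-openai are listed in requirements.txt; adjust to the
# versions pinned in this repo.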