""" main.py — Plexi API (FastAPI service for HuggingFace Spaces) ============================================================ Endpoints: POST /retrieve — embed query + vector search (scope-filtered) GET /manifest — proxy + cache the materials manifest.json GET /health — liveness probe (also used by keep-alive cron) The heavy resources (index + embedding model) are loaded ONCE at startup via FastAPI's lifespan context manager and shared across all requests. """ import os import time from contextlib import asynccontextmanager from functools import lru_cache import requests from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from pydantic import BaseModel, Field from rag import ( DEFAULT_TOP_K, MATERIALS_REPO, MANIFEST_BRANCH, format_context, load_index, retrieve_chunks, ) # --------------------------------------------------------------------------- # Config # --------------------------------------------------------------------------- ALLOWED_ORIGINS = os.getenv( "ALLOWED_ORIGINS", # Default: allow the Cloudflare Pages domain + localhost for dev "https://plexi.lazyhideout.tech,http://localhost:5173,http://localhost:4173", ).split(",") # --------------------------------------------------------------------------- # Startup / Shutdown — load heavy resources once # --------------------------------------------------------------------------- _state: dict = {} @asynccontextmanager async def lifespan(app: FastAPI): """Load the RAG index at startup; release on shutdown.""" print("Loading RAG index from GitHub…") t0 = time.time() index, error = load_index() elapsed = round(time.time() - t0, 2) if error: print(f"⚠️ RAG index unavailable: {error}") _state["index"] = None _state["index_error"] = error else: print(f"✅ RAG index loaded in {elapsed}s") _state["index"] = index _state["index_error"] = None _state["index_loaded"] = index is not None _state["startup_ts"] = time.time() yield # 
Cleanup (nothing heavy to clean up here) _state.clear() # --------------------------------------------------------------------------- # App # --------------------------------------------------------------------------- app = FastAPI( title="Plexi API", description=( "RAG retrieval backend for Plexi. " "Accepts student queries and returns relevant study material chunks." ), version="1.0.0", lifespan=lifespan, ) app.add_middleware( CORSMiddleware, allow_origins=ALLOWED_ORIGINS, allow_credentials=False, allow_methods=["GET", "POST", "OPTIONS"], allow_headers=["Content-Type"], ) # --------------------------------------------------------------------------- # Request / Response models # --------------------------------------------------------------------------- class RetrieveRequest(BaseModel): query: str = Field(..., min_length=1, max_length=2000) semester: str = Field(..., min_length=1, max_length=100) subject: str = Field(..., min_length=1, max_length=100) top_k: int = Field(default=DEFAULT_TOP_K, ge=1, le=20) class ChunkResult(BaseModel): text: str score: float | None filename: str | None subject: str | None class RetrieveResponse(BaseModel): chunks: list[ChunkResult] query: str semester: str subject: str rag_active: bool context_formatted: str # --------------------------------------------------------------------------- # Manifest caching (simple in-memory, 5-minute TTL) # --------------------------------------------------------------------------- _manifest_cache: dict = {"data": None, "fetched_at": 0} MANIFEST_TTL = 300 # seconds def _get_manifest() -> dict: now = time.time() if _manifest_cache["data"] and (now - _manifest_cache["fetched_at"]) < MANIFEST_TTL: return _manifest_cache["data"] url = f"https://raw.githubusercontent.com/{MATERIALS_REPO}/{MANIFEST_BRANCH}/manifest.json" resp = requests.get(url, timeout=15) resp.raise_for_status() data = resp.json() _manifest_cache["data"] = data _manifest_cache["fetched_at"] = now return data # 
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------


@app.get("/health")
def health():
    """Liveness probe — also pinged by the GitHub Actions keep-alive cron.

    Reports whether the index loaded, any load error, and process uptime.
    """
    # Read the clock once so a missing "startup_ts" yields exactly 0.0.
    now = time.time()
    uptime = round(now - _state.get("startup_ts", now), 1)
    return {
        "status": "ok",
        "index_loaded": _state.get("index_loaded", False),
        "index_error": _state.get("index_error"),
        "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
        "uptime_seconds": uptime,
    }


@app.get("/manifest")
def get_manifest():
    """
    Proxy and cache the study materials manifest.json from GitHub.
    The Cloudflare Worker also caches this in KV — this is a double layer.

    Raises:
        HTTPException(502): the upstream fetch failed (HTTP error status,
            timeout, connection error) or returned a body that is not JSON.
        HTTPException(500): any other unexpected failure.
    """
    try:
        data = _get_manifest()
    except requests.RequestException as err:
        # RequestException is the base class of HTTPError, Timeout and
        # ConnectionError: all of them are upstream failures, not ours.
        raise HTTPException(
            status_code=502, detail=f"GitHub fetch failed: {err}"
        ) from err
    except ValueError as err:
        # resp.json() could not decode the upstream body.
        raise HTTPException(
            status_code=502, detail=f"GitHub fetch failed: {err}"
        ) from err
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err)) from err
    return JSONResponse(content=data)


@app.post("/retrieve", response_model=RetrieveResponse)
def retrieve(body: RetrieveRequest):
    """
    Core RAG endpoint. The retrieval itself is delegated to
    ``rag.retrieve_chunks``; per the original design notes it:

    1. Embeds the query using all-MiniLM-L6-v2 (local, fast ~5-10ms)
    2. Searches the pre-built LlamaIndex vector store
    3. Filters results by semester + subject metadata
    4. Returns top-k chunks + a formatted context string for the LLM prompt

    ``rag_active`` in the response is False when no index was loaded at
    startup; ``retrieve_chunks`` is still called with ``index=None``.
    """
    index = _state.get("index")
    chunks = retrieve_chunks(
        index=index,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        top_k=body.top_k,
    )
    context_formatted = format_context(chunks)
    return RetrieveResponse(
        chunks=chunks,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        rag_active=index is not None,
        context_formatted=context_formatted,
    )


# ---------------------------------------------------------------------------
# Run (for local development only — HF uses Dockerfile CMD)
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)