| """ |
| main.py — Plexi API (FastAPI service for HuggingFace Spaces) |
| ============================================================ |
| Endpoints: |
| POST /retrieve — embed query + vector search (scope-filtered) |
| GET /manifest — proxy + cache the materials manifest.json |
| GET /health — liveness probe (also used by keep-alive cron) |
| |
| The heavy resources (index + embedding model) are loaded ONCE at startup via |
| FastAPI's lifespan context manager and shared across all requests. |
| """ |
|
|
| import os |
| import time |
| from contextlib import asynccontextmanager |
| from functools import lru_cache |
|
|
| import requests |
| from fastapi import FastAPI, HTTPException, Request |
| from fastapi.middleware.cors import CORSMiddleware |
| from fastapi.responses import JSONResponse |
| from pydantic import BaseModel, Field |
|
|
| from rag import ( |
| DEFAULT_TOP_K, |
| MATERIALS_REPO, |
| MANIFEST_BRANCH, |
| format_context, |
| load_index, |
| retrieve_chunks, |
| ) |
|
|
| |
| |
| |
def _parse_origins(raw: str) -> list[str]:
    """Split a comma-separated origin list, trimming whitespace and dropping empty entries."""
    return [origin for origin in (part.strip() for part in raw.split(",")) if origin]


# Origins accepted by the CORS middleware. Override with a comma-separated
# ALLOWED_ORIGINS environment variable. Entries are stripped because the value
# is matched against the browser's Origin header as an exact string, so
# "a, b" must not produce " b".
ALLOWED_ORIGINS = _parse_origins(
    os.getenv(
        "ALLOWED_ORIGINS",
        "https://plexi.lazyhideout.tech,http://localhost:5173,http://localhost:4173",
    )
)
|
|
| |
| |
| |
# Process-wide state: filled in by `lifespan` at startup, read by the request
# handlers, and emptied again on shutdown.
_state: dict = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the RAG index at startup; release on shutdown.

    Calls ``load_index()``, which returns an ``(index, error)`` pair, and
    records the outcome in ``_state`` under the keys ``index``,
    ``index_error``, ``index_loaded`` and ``startup_ts``. A load failure is
    reported but does not abort startup; the service then runs with
    ``_state["index"]`` set to ``None``.

    Args:
        app: The FastAPI application. Not used in the body.
    """
    print("Loading RAG index from GitHub…")
    t0 = time.time()
    index, error = load_index()
    elapsed = round(time.time() - t0, 2)

    if error:
        print(f"⚠️ RAG index unavailable: {error}")
        _state["index"] = None
        _state["index_error"] = error
    else:
        print(f"✅ RAG index loaded in {elapsed}s")
        _state["index"] = index
        _state["index_error"] = None

    # Derive the flag from what was actually stored, not from the raw return
    # value, so /health can never report a loaded index that /retrieve does
    # not have (e.g. if load_index returned both an index and an error).
    _state["index_loaded"] = _state["index"] is not None
    _state["startup_ts"] = time.time()
    try:
        yield
    finally:
        # Runs even when an exception is thrown into the generator at `yield`.
        _state.clear()
|
|
|
|
| |
| |
| |
# Human-readable service description passed to FastAPI below.
_API_DESCRIPTION = " ".join(
    (
        "RAG retrieval backend for Plexi.",
        "Accepts student queries and returns relevant study material chunks.",
    )
)

# CORS policy: only the configured origins, no credentials, and just the
# methods and headers listed here.
_CORS_SETTINGS = {
    "allow_origins": ALLOWED_ORIGINS,
    "allow_credentials": False,
    "allow_methods": ["GET", "POST", "OPTIONS"],
    "allow_headers": ["Content-Type"],
}

# The application object; `lifespan` is registered as its startup/shutdown hook.
app = FastAPI(
    title="Plexi API",
    description=_API_DESCRIPTION,
    version="1.0.0",
    lifespan=lifespan,
)
app.add_middleware(CORSMiddleware, **_CORS_SETTINGS)
|
|
|
|
| |
| |
| |
class RetrieveRequest(BaseModel):
    # Request body accepted by POST /retrieve.
    # `#` comments are used instead of a class docstring so the generated
    # schema description stays unchanged.

    # Free-text question to search for (1-2000 characters).
    query: str = Field(default=..., min_length=1, max_length=2000)
    # Scope values forwarded to retrieve_chunks (1-100 characters each).
    semester: str = Field(default=..., min_length=1, max_length=100)
    subject: str = Field(default=..., min_length=1, max_length=100)
    # Number of chunks requested; must be between 1 and 20 inclusive.
    top_k: int = Field(default=DEFAULT_TOP_K, ge=1, le=20)
|
|
|
|
class ChunkResult(BaseModel):
    # One entry of RetrieveResponse.chunks.
    # Field meanings below are inferred from the names — TODO confirm against
    # what rag.retrieve_chunks actually returns.

    # Chunk content.
    text: str
    # Presumably the similarity score from the vector search; may be None.
    score: float | None
    # Presumably the source file and subject the chunk came from; may be None.
    filename: str | None
    subject: str | None
|
|
|
|
class RetrieveResponse(BaseModel):
    # Response body of POST /retrieve (declared as its response_model).

    # The chunks returned by retrieve_chunks for this request.
    chunks: list[ChunkResult]
    # Echo of the request's query / semester / subject.
    query: str
    semester: str
    subject: str
    # True when an index was available in the process state for this request.
    rag_active: bool
    # Result of format_context(chunks).
    context_formatted: str
|
|
|
|
| |
| |
| |
# In-process manifest cache: the last payload fetched ("data") and the
# time.time() value at which it was stored ("fetched_at").
_manifest_cache: dict = {"data": None, "fetched_at": 0}
# Maximum age of a cached manifest, in seconds.
MANIFEST_TTL = 300


def _get_manifest() -> dict:
    """Return the materials manifest, re-using the cached copy while it is fresh.

    On a cache miss, manifest.json is downloaded from raw.githubusercontent.com
    for ``MATERIALS_REPO`` / ``MANIFEST_BRANCH``, stored in ``_manifest_cache``
    and returned.

    Returns:
        The parsed JSON body of manifest.json.

    Raises:
        requests.RequestException: If the download fails; this includes the
            ``HTTPError`` raised by ``raise_for_status()`` for error statuses.
        ValueError: If the response body is not valid JSON.
    """
    cached = _manifest_cache["data"]
    age = time.time() - _manifest_cache["fetched_at"]
    # Compare against None, not truthiness: an empty manifest ({}) is still a
    # valid cached value and must not trigger a re-fetch on every request.
    if cached is not None and age < MANIFEST_TTL:
        return cached

    url = f"https://raw.githubusercontent.com/{MATERIALS_REPO}/{MANIFEST_BRANCH}/manifest.json"
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    # Only a fully successful fetch reaches this point, so a failed download
    # never overwrites the previous entry. Stamp after the download so the
    # request latency does not eat into the TTL.
    _manifest_cache["data"] = data
    _manifest_cache["fetched_at"] = time.time()
    return data
|
|
|
|
| |
| |
| |
@app.get("/health")
def health():
    """Liveness probe — also pinged by the GitHub Actions keep-alive cron."""
    # Uptime is measured from the timestamp recorded at startup. If none has
    # been recorded, the fallback makes the reported uptime approximately zero.
    checked_at = time.time()
    started_at = _state.get("startup_ts", time.time())
    uptime = round(checked_at - started_at, 1)

    # Always answers "ok"; the index fields report whether retrieval is
    # actually backed by a loaded index.
    payload = {
        "status": "ok",
        "index_loaded": _state.get("index_loaded", False),
        "index_error": _state.get("index_error"),
        "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
        "uptime_seconds": uptime,
    }
    return payload
|
|
|
|
@app.get("/manifest")
def get_manifest():
    """
    Proxy and cache the study materials manifest.json from GitHub.
    The Cloudflare Worker also caches this in KV — this is a double layer.
    """
    # Error mapping:
    #   502 — the upstream fetch failed. RequestException covers HTTP error
    #         statuses as well as timeouts and connection errors, which are
    #         not HTTPError; ValueError covers an upstream body that is not
    #         valid JSON.
    #   500 — anything else, i.e. an unexpected failure on our side.
    # Only the call that can fail sits inside the try block.
    try:
        data = _get_manifest()
    except (requests.RequestException, ValueError) as err:
        raise HTTPException(status_code=502, detail=f"GitHub fetch failed: {err}") from err
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err)) from err
    return JSONResponse(content=data)
|
|
|
|
@app.post("/retrieve", response_model=RetrieveResponse)
def retrieve(body: RetrieveRequest):
    """
    Core RAG endpoint.

    1. Embeds the query using all-MiniLM-L6-v2 (local, fast ~5-10ms)
    2. Searches the pre-built LlamaIndex vector store
    3. Filters results by semester + subject metadata
    4. Returns top-k chunks + a formatted context string for the LLM prompt
    """
    # The index is whatever startup stored; it may be None. It is passed
    # through unchanged, and `rag_active` tells the client whether one existed.
    active_index = _state.get("index")

    hits = retrieve_chunks(
        index=active_index,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        top_k=body.top_k,
    )

    # Echo the request scope back alongside the hits and the prompt-ready
    # context built from them.
    return RetrieveResponse(
        chunks=hits,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        rag_active=active_index is not None,
        context_formatted=format_context(hits),
    )
|
|
|
|
| |
| |
| |
if __name__ == "__main__":
    # Local entry point: serve the app with uvicorn when this file is run
    # directly. uvicorn is imported here so it is only needed in that case.
    import uvicorn

    # Listen on all interfaces on port 7860, with auto-reload enabled.
    server_options = {"host": "0.0.0.0", "port": 7860, "reload": True}
    uvicorn.run("main:app", **server_options)
|
|