# plexi-api / main.py
# LazyHuman10
# Initial commit for HF Space
# 3b6130d
"""
main.py — Plexi API (FastAPI service for HuggingFace Spaces)
============================================================
Endpoints:
POST /retrieve — embed query + vector search (scope-filtered)
GET /manifest — proxy + cache the materials manifest.json
GET /health — liveness probe (also used by keep-alive cron)
The heavy resources (index + embedding model) are loaded ONCE at startup via
FastAPI's lifespan context manager and shared across all requests.
"""
import os
import time
from contextlib import asynccontextmanager
from functools import lru_cache
import requests
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from rag import (
DEFAULT_TOP_K,
MATERIALS_REPO,
MANIFEST_BRANCH,
format_context,
load_index,
retrieve_chunks,
)
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Origins accepted by the CORS middleware, read from the comma-separated
# ALLOWED_ORIGINS environment variable.
ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.getenv(
        "ALLOWED_ORIGINS",
        # Default: allow the Cloudflare Pages domain + localhost for dev
        "https://plexi.lazyhideout.tech,http://localhost:5173,http://localhost:4173",
    ).split(",")
    # Drop blanks and surrounding whitespace: a value such as "a, b," would
    # otherwise yield " b" and "", neither of which can match an Origin header.
    if origin.strip()
]
# ---------------------------------------------------------------------------
# Startup / Shutdown — load heavy resources once
# ---------------------------------------------------------------------------
# Process-wide holder for startup results. lifespan() writes the keys
# "index", "index_error", "index_loaded" and "startup_ts"; the route
# handlers read them.
_state: dict = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the RAG index at startup; release on shutdown.

    The code before ``yield`` runs once before the application starts
    serving requests. A failed load does not abort startup: the error is
    recorded in ``_state`` and the index is stored as ``None``.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    print("Loading RAG index from GitHub…")
    t0 = time.time()
    index, error = load_index()
    elapsed = round(time.time() - t0, 2)

    if error:
        print(f"⚠️ RAG index unavailable: {error}")
        _state["index"] = None
        _state["index_error"] = error
    else:
        print(f"✅ RAG index loaded in {elapsed}s")
        _state["index"] = index
        _state["index_error"] = None

    # Derive the flag from what was actually stored, so it can never disagree
    # with _state["index"] (e.g. if load_index returned both an index and an
    # error, the index is discarded above and the flag must be False).
    _state["index_loaded"] = _state["index"] is not None
    _state["startup_ts"] = time.time()

    try:
        yield
    finally:
        # Cleanup (nothing heavy to clean up here). Kept in ``finally`` so the
        # state is also dropped when shutdown is triggered by an exception.
        _state.clear()
# ---------------------------------------------------------------------------
# App
# ---------------------------------------------------------------------------
# The ASGI application object; startup/shutdown work is delegated to lifespan().
app = FastAPI(
    title="Plexi API",
    version="1.0.0",
    description="RAG retrieval backend for Plexi. "
    "Accepts student queries and returns relevant study material chunks.",
    lifespan=lifespan,
)

# CORS: only the configured origins, no credentials, and only the methods and
# headers listed below.
app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["Content-Type"],
    allow_credentials=False,
)
# ---------------------------------------------------------------------------
# Request / Response models
# ---------------------------------------------------------------------------
class RetrieveRequest(BaseModel):
    """JSON body accepted by ``POST /retrieve``."""

    # Query text, 1 to 2000 characters.
    query: str = Field(..., min_length=1, max_length=2000)

    # Scope values forwarded to retrieve_chunks(); 1 to 100 characters each.
    semester: str = Field(..., min_length=1, max_length=100)
    subject: str = Field(..., min_length=1, max_length=100)

    # Number of chunks requested; between 1 and 20, DEFAULT_TOP_K when omitted.
    top_k: int = Field(default=DEFAULT_TOP_K, ge=1, le=20)
class ChunkResult(BaseModel):
    """A single entry of ``RetrieveResponse.chunks``."""

    # Chunk text.
    text: str

    # The remaining fields may be None.
    # NOTE(review): presumably the retrieval score and the source file /
    # subject metadata produced by rag.retrieve_chunks; confirm there.
    score: float | None
    filename: str | None
    subject: str | None
class RetrieveResponse(BaseModel):
    """Response body returned by ``POST /retrieve``."""

    # Chunks returned by retrieve_chunks() for this request.
    chunks: list[ChunkResult]

    # Echo of the request's query and scope.
    query: str
    semester: str
    subject: str

    # True when an index was available while serving the request.
    rag_active: bool

    # Output of format_context() for the chunks above.
    context_formatted: str
# ---------------------------------------------------------------------------
# Manifest caching (simple in-memory, 5-minute TTL)
# ---------------------------------------------------------------------------
# Single-entry in-memory cache: "data" is the last manifest fetched (None until
# the first successful fetch) and "fetched_at" is the time.time() value taken
# at the start of the call that fetched it.
_manifest_cache: dict = {"data": None, "fetched_at": 0}
MANIFEST_TTL = 300  # seconds


def _get_manifest() -> dict:
    """Return the materials manifest, using the cached copy while it is fresh.

    The cached value is returned when one exists and is younger than
    ``MANIFEST_TTL`` seconds; otherwise ``manifest.json`` is downloaded from
    raw.githubusercontent.com and the cache is refreshed.

    Returns:
        The parsed JSON content of ``manifest.json``.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
        Exception: Any other error raised by ``requests.get`` or by JSON
            decoding propagates unchanged; the cache is left untouched.
    """
    now = time.time()
    cached = _manifest_cache["data"]
    # Compare against None rather than truthiness: an empty manifest is still
    # a valid cached value and must not trigger a download on every call.
    if cached is not None and (now - _manifest_cache["fetched_at"]) < MANIFEST_TTL:
        return cached

    url = f"https://raw.githubusercontent.com/{MATERIALS_REPO}/{MANIFEST_BRANCH}/manifest.json"
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    _manifest_cache["data"] = data
    _manifest_cache["fetched_at"] = now
    return data
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@app.get("/health")
def health():
    """Liveness probe — also pinged by the GitHub Actions keep-alive cron.

    Reports index availability as recorded at startup and the seconds
    elapsed since startup, rounded to one decimal.
    """
    now = time.time()
    # Falls back to the current time when startup has not recorded a timestamp.
    started_at = _state.get("startup_ts", time.time())
    payload = {
        "status": "ok",
        "index_loaded": _state.get("index_loaded", False),
        "index_error": _state.get("index_error"),
        "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
        "uptime_seconds": round(now - started_at, 1),
    }
    return payload
@app.get("/manifest")
def get_manifest():
    """
    Proxy and cache the study materials manifest.json from GitHub.

    The Cloudflare Worker also caches this in KV — this is a double layer.

    Returns:
        A JSONResponse carrying the manifest.

    Raises:
        HTTPException: 502 when the request to GitHub fails (error status,
            timeout, connection problem, ...); 500 for any other error.
    """
    try:
        data = _get_manifest()
        return JSONResponse(content=data)
    except requests.RequestException as err:
        # RequestException is the base class of HTTPError, so error statuses
        # still map to 502; timeouts and connection failures now do too,
        # instead of being reported as an internal 500.
        raise HTTPException(
            status_code=502, detail=f"GitHub fetch failed: {err}"
        ) from err
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err)) from err
@app.post("/retrieve", response_model=RetrieveResponse)
def retrieve(body: RetrieveRequest):
    """
    Core RAG endpoint.

    Hands the query, its semester/subject scope and ``top_k`` to
    ``retrieve_chunks`` together with the index loaded at startup, then
    returns the chunks plus a ``format_context`` string for the LLM prompt.

    NOTE(review): query embedding, vector search and metadata filtering are
    expected to happen inside ``rag.retrieve_chunks``; confirm there.
    """
    # None when the index failed to load at startup; reported via rag_active.
    rag_index = _state.get("index")

    matches = retrieve_chunks(
        index=rag_index,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        top_k=body.top_k,
    )

    return RetrieveResponse(
        chunks=matches,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        rag_active=rag_index is not None,
        context_formatted=format_context(matches),
    )
# ---------------------------------------------------------------------------
# Run (for local development only — HF uses Dockerfile CMD)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run directly.
    import uvicorn

    # Listen on every interface at port 7860 with auto-reload enabled.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        reload=True,
    )