""" SPARKNET API Schemas Pydantic models for request/response validation. """ from pydantic import BaseModel, Field, ConfigDict from typing import List, Dict, Any, Optional from datetime import datetime from enum import Enum # ==================== Enums ==================== class DocumentStatus(str, Enum): PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" INDEXED = "indexed" ERROR = "error" class QueryIntentType(str, Enum): FACTOID = "factoid" COMPARISON = "comparison" AGGREGATION = "aggregation" CAUSAL = "causal" PROCEDURAL = "procedural" DEFINITION = "definition" LIST = "list" MULTI_HOP = "multi_hop" class AnswerFormat(str, Enum): PROSE = "prose" BULLET_POINTS = "bullet_points" TABLE = "table" STEP_BY_STEP = "step_by_step" # ==================== Document Schemas ==================== class DocumentUploadResponse(BaseModel): """Response after uploading a document.""" model_config = ConfigDict(from_attributes=True) doc_id: str = Field(..., description="Unique document identifier") filename: str = Field(..., description="Original filename") status: DocumentStatus = Field(..., description="Document status") message: str = Field(..., description="Status message") created_at: datetime = Field(default_factory=datetime.now) class DocumentMetadata(BaseModel): """Document metadata information.""" model_config = ConfigDict(from_attributes=True) doc_id: str filename: str file_type: str page_count: int = 0 chunk_count: int = 0 text_length: int = 0 status: DocumentStatus indexed: bool = False indexed_chunks: int = 0 processing_time: Optional[float] = None created_at: datetime updated_at: Optional[datetime] = None class DocumentResponse(BaseModel): """Full document response with metadata.""" model_config = ConfigDict(from_attributes=True) doc_id: str filename: str file_type: str status: DocumentStatus metadata: DocumentMetadata raw_text: Optional[str] = Field(None, description="Full extracted text (if requested)") preview: Optional[str] = Field(None, description="Text preview (first 500 chars)") class ChunkInfo(BaseModel): """Information about a document chunk.""" model_config = ConfigDict(from_attributes=True) chunk_id: str doc_id: str text: str chunk_type: str = "text" page_num: Optional[int] = None confidence: float = 1.0 bbox: Optional[Dict[str, float]] = None metadata: Dict[str, Any] = Field(default_factory=dict) class ChunksResponse(BaseModel): """Response containing document chunks.""" doc_id: str total_chunks: int chunks: List[ChunkInfo] class OCRRegionInfo(BaseModel): """OCR region information.""" region_id: str text: str confidence: float page_num: int bbox: Dict[str, float] class LayoutRegionInfo(BaseModel): """Layout region information.""" region_id: str region_type: str confidence: float page_num: int bbox: Dict[str, float] class DocumentDetailResponse(BaseModel): """Detailed document response with all extracted data.""" doc_id: str filename: str status: DocumentStatus metadata: DocumentMetadata chunks: List[ChunkInfo] ocr_regions: List[OCRRegionInfo] = Field(default_factory=list) layout_regions: List[LayoutRegionInfo] = Field(default_factory=list) # ==================== RAG Query Schemas ==================== class QueryRequest(BaseModel): """RAG query request.""" query: str = Field(..., min_length=1, max_length=2000, description="Query text") doc_ids: Optional[List[str]] = Field(None, description="Filter by document IDs") top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve") answer_format: AnswerFormat = Field(AnswerFormat.PROSE, description="Desired answer format") include_sources: bool = Field(True, description="Include source citations") min_confidence: float = Field(0.5, ge=0.0, le=1.0, description="Minimum confidence threshold") use_cache: bool = Field(True, description="Use cached results if available") class Citation(BaseModel): """Citation/source reference.""" citation_id: int = Field(..., description="Citation number [1], [2], etc.") doc_id: str document_name: str chunk_id: str chunk_text: str page_num: Optional[int] = None relevance_score: float bbox: Optional[Dict[str, float]] = None class QueryPlan(BaseModel): """Query planning information.""" intent: QueryIntentType sub_queries: List[str] = Field(default_factory=list) keywords: List[str] = Field(default_factory=list) strategy: str = "hybrid" class RAGResponse(BaseModel): """Complete RAG response.""" query: str answer: str confidence: float = Field(..., ge=0.0, le=1.0) citations: List[Citation] = Field(default_factory=list) source_count: int = 0 query_plan: Optional[QueryPlan] = None from_cache: bool = False validation: Optional[Dict[str, Any]] = None latency_ms: Optional[float] = None revision_count: int = 0 class SearchRequest(BaseModel): """Semantic search request.""" query: str = Field(..., min_length=1, max_length=1000) doc_ids: Optional[List[str]] = None top_k: int = Field(10, ge=1, le=50) min_score: float = Field(0.0, ge=0.0, le=1.0) class SearchResult(BaseModel): """Single search result.""" chunk_id: str doc_id: str document_name: str text: str score: float page_num: Optional[int] = None chunk_type: str = "text" class SearchResponse(BaseModel): """Search response with results.""" query: str total_results: int results: List[SearchResult] latency_ms: float # ==================== Indexing Schemas ==================== class IndexRequest(BaseModel): """Request to index a document.""" doc_id: str = Field(..., description="Document ID to index") force_reindex: bool = Field(False, description="Force reindexing if already indexed") class IndexResponse(BaseModel): """Indexing response.""" doc_id: str status: str chunks_indexed: int message: str class BatchIndexRequest(BaseModel): """Batch indexing request.""" doc_ids: List[str] force_reindex: bool = False class BatchIndexResponse(BaseModel): """Batch indexing response.""" total_requested: int successful: int failed: int results: List[IndexResponse] # ==================== System Schemas ==================== class HealthResponse(BaseModel): """Health check response.""" status: str = Field(..., description="healthy, degraded, or unhealthy") version: str components: Dict[str, bool] class SystemStatus(BaseModel): """Detailed system status.""" status: str version: str uptime_seconds: float components: Dict[str, bool] statistics: Dict[str, Any] models: Dict[str, str] class CollectionInfo(BaseModel): """Vector store collection information.""" name: str document_count: int chunk_count: int embedding_dimension: int class StoreStatus(BaseModel): """Vector store status.""" status: str collections: List[CollectionInfo] total_documents: int total_chunks: int # ==================== Authentication Schemas ==================== class UserCreate(BaseModel): """User creation request.""" username: str = Field(..., min_length=3, max_length=50) email: str password: str = Field(..., min_length=8) class UserResponse(BaseModel): """User response (no password).""" user_id: str username: str email: str is_active: bool = True created_at: datetime class Token(BaseModel): """JWT token response.""" access_token: str token_type: str = "bearer" expires_in: int class TokenData(BaseModel): """Token payload data.""" username: Optional[str] = None user_id: Optional[str] = None scopes: List[str] = Field(default_factory=list)