""" SPARKNET Document Intelligence Subsystem A comprehensive document processing pipeline for: - OCR with PaddleOCR and Tesseract - Layout detection and reading order reconstruction - Semantic chunking with grounding evidence - Document classification and field extraction - Extraction validation with Critic/Verifier Principles: - Processing is not understanding: OCR alone is insufficient - Every extraction includes evidence pointers (bbox, page, chunk_id) - Modular, pluggable components with clean interfaces - Abstain with evidence when confidence is low """ from .schemas.core import ( BoundingBox, OCRRegion, LayoutRegion, LayoutType, DocumentChunk, ChunkType, EvidenceRef, ExtractionResult, DocumentMetadata, ProcessedDocument, ) from .pipeline import ( PipelineConfig, DocumentProcessor, get_document_processor, process_document, ) from .validation import ( CriticConfig, ValidationResult, ExtractionCritic, get_extraction_critic, VerifierConfig, VerificationResult, EvidenceVerifier, get_evidence_verifier, ) __all__ = [ # Core schemas "BoundingBox", "OCRRegion", "LayoutRegion", "LayoutType", "DocumentChunk", "ChunkType", "EvidenceRef", "ExtractionResult", "DocumentMetadata", "ProcessedDocument", # Pipeline "PipelineConfig", "DocumentProcessor", "get_document_processor", "process_document", # Validation "CriticConfig", "ValidationResult", "ExtractionCritic", "get_extraction_critic", "VerifierConfig", "VerificationResult", "EvidenceVerifier", "get_evidence_verifier", ]