"""
SPARKNET Document Intelligence Subsystem

A comprehensive document processing pipeline for:
- OCR with PaddleOCR and Tesseract
- Layout detection and reading order reconstruction
- Semantic chunking with grounding evidence
- Document classification and field extraction
- Extraction validation with Critic/Verifier

Principles:
- Processing is not understanding: OCR alone is insufficient
- Every extraction includes evidence pointers (bbox, page, chunk_id)
- Modular, pluggable components with clean interfaces
- Abstain with evidence when confidence is low
"""

# Names re-exported from the package's schema module.
from .schemas.core import (
    BoundingBox,
    OCRRegion,
    LayoutRegion,
    LayoutType,
    DocumentChunk,
    ChunkType,
    EvidenceRef,
    ExtractionResult,
    DocumentMetadata,
    ProcessedDocument,
)

# Names re-exported from the pipeline module.
from .pipeline import (
    PipelineConfig,
    DocumentProcessor,
    get_document_processor,
    process_document,
)

# Names re-exported from the validation module.
from .validation import (
    CriticConfig,
    ValidationResult,
    ExtractionCritic,
    get_extraction_critic,
    VerifierConfig,
    VerificationResult,
    EvidenceVerifier,
    get_evidence_verifier,
)

# Explicit public API: exactly the names imported above, grouped by the
# submodule they come from. Keep this list in sync with the imports.
__all__ = [
    # Core schemas
    "BoundingBox",
    "OCRRegion",
    "LayoutRegion",
    "LayoutType",
    "DocumentChunk",
    "ChunkType",
    "EvidenceRef",
    "ExtractionResult",
    "DocumentMetadata",
    "ProcessedDocument",
    # Pipeline
    "PipelineConfig",
    "DocumentProcessor",
    "get_document_processor",
    "process_document",
    # Validation
    "CriticConfig",
    "ValidationResult",
    "ExtractionCritic",
    "get_extraction_critic",
    "VerifierConfig",
    "VerificationResult",
    "EvidenceVerifier",
    "get_evidence_verifier",
]