"""
SPARKNET Document Intelligence Subsystem
A comprehensive document processing pipeline for:
- OCR with PaddleOCR and Tesseract
- Layout detection and reading order reconstruction
- Semantic chunking with grounding evidence
- Document classification and field extraction
- Extraction validation with Critic/Verifier
Principles:
- Processing is not understanding: OCR alone is insufficient
- Every extraction includes evidence pointers (bbox, page, chunk_id)
- Modular, pluggable components with clean interfaces
- Abstain with evidence when confidence is low
"""
from .schemas.core import (
BoundingBox,
OCRRegion,
LayoutRegion,
LayoutType,
DocumentChunk,
ChunkType,
EvidenceRef,
ExtractionResult,
DocumentMetadata,
ProcessedDocument,
)
from .pipeline import (
PipelineConfig,
DocumentProcessor,
get_document_processor,
process_document,
)
from .validation import (
CriticConfig,
ValidationResult,
ExtractionCritic,
get_extraction_critic,
VerifierConfig,
VerificationResult,
EvidenceVerifier,
get_evidence_verifier,
)
# Public API of the document-intelligence package, grouped to mirror the
# import blocks above: core schemas, then the processing pipeline, then
# the extraction-validation (critic/verifier) components.
__all__ = [
    # Core schemas
    "BoundingBox", "OCRRegion", "LayoutRegion", "LayoutType",
    "DocumentChunk", "ChunkType", "EvidenceRef", "ExtractionResult",
    "DocumentMetadata", "ProcessedDocument",
    # Pipeline
    "PipelineConfig", "DocumentProcessor",
    "get_document_processor", "process_document",
    # Validation
    "CriticConfig", "ValidationResult",
    "ExtractionCritic", "get_extraction_critic",
    "VerifierConfig", "VerificationResult",
    "EvidenceVerifier", "get_evidence_verifier",
]