# SPARKNET / src /document /__init__.py
# MHamdan's picture
# Initial commit: SPARKNET framework
# d520909
"""
SPARKNET Document Intelligence Subsystem.

A comprehensive document processing pipeline for:

- OCR with PaddleOCR and Tesseract
- Layout detection and reading order reconstruction
- Semantic chunking with grounding evidence
- Document classification and field extraction
- Extraction validation with Critic/Verifier

Principles:

- Processing is not understanding: OCR alone is insufficient
- Every extraction includes evidence pointers (bbox, page, chunk_id)
- Modular, pluggable components with clean interfaces
- Abstain with evidence when confidence is low
"""
from .schemas.core import (
BoundingBox,
OCRRegion,
LayoutRegion,
LayoutType,
DocumentChunk,
ChunkType,
EvidenceRef,
ExtractionResult,
DocumentMetadata,
ProcessedDocument,
)
from .pipeline import (
PipelineConfig,
DocumentProcessor,
get_document_processor,
process_document,
)
from .validation import (
CriticConfig,
ValidationResult,
ExtractionCritic,
get_extraction_critic,
VerifierConfig,
VerificationResult,
EvidenceVerifier,
get_evidence_verifier,
)
# Public API of this package: the names bound by ``from <package> import *``.
# Kept as a single literal list, grouped by the submodule each name comes from.
__all__ = [
    # Re-exported from .schemas.core
    "BoundingBox",
    "OCRRegion",
    "LayoutRegion",
    "LayoutType",
    "DocumentChunk",
    "ChunkType",
    "EvidenceRef",
    "ExtractionResult",
    "DocumentMetadata",
    "ProcessedDocument",
    # Re-exported from .pipeline
    "PipelineConfig",
    "DocumentProcessor",
    "get_document_processor",
    "process_document",
    # Re-exported from .validation
    "CriticConfig",
    "ValidationResult",
    "ExtractionCritic",
    "get_extraction_critic",
    "VerifierConfig",
    "VerificationResult",
    "EvidenceVerifier",
    "get_evidence_verifier",
]