|
|
""" |
|
|
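Document Classification Schemas

Pydantic models for document type classification and categorization:
the document type taxonomy, per-type scores, the classification result,
rule definitions for rule-based matching, and classifier configuration.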
|
|
""" |
|
|
|
|
|
from enum import Enum |
|
|
from typing import List, Dict, Any, Optional |
|
|
from pydantic import BaseModel, Field |
|
|
|
|
|
from .core import EvidenceRef |
|
|
|
|
|
|
|
|
class DocumentType(str, Enum):
    """
    Document types that a classification can assign.

    Every member is also a ``str`` whose value is the lowercase snake_case
    label shown below. Extend this enum to add domain-specific types.
    """

    # Legal and business documents
    CONTRACT = "contract"
    INVOICE = "invoice"
    RECEIPT = "receipt"
    PURCHASE_ORDER = "purchase_order"
    AGREEMENT = "agreement"
    NDA = "nda"
    TERMS_OF_SERVICE = "terms_of_service"

    # Technical and scientific documents
    PATENT = "patent"
    RESEARCH_PAPER = "research_paper"
    TECHNICAL_REPORT = "technical_report"
    SPECIFICATION = "specification"
    DATASHEET = "datasheet"
    USER_MANUAL = "user_manual"

    # Financial documents
    FINANCIAL_REPORT = "financial_report"
    BANK_STATEMENT = "bank_statement"
    TAX_FORM = "tax_form"
    BALANCE_SHEET = "balance_sheet"
    INCOME_STATEMENT = "income_statement"

    # Identity documents and forms
    ID_DOCUMENT = "id_document"
    PASSPORT = "passport"
    DRIVERS_LICENSE = "drivers_license"
    CERTIFICATE = "certificate"
    FORM = "form"
    APPLICATION = "application"

    # Medical and insurance documents
    MEDICAL_RECORD = "medical_record"
    PRESCRIPTION = "prescription"
    LAB_REPORT = "lab_report"
    INSURANCE_CLAIM = "insurance_claim"

    # General-purpose documents
    LETTER = "letter"
    EMAIL = "email"
    MEMO = "memo"
    PRESENTATION = "presentation"
    SPREADSHEET = "spreadsheet"
    REPORT = "report"
    ARTICLE = "article"
    BOOK = "book"

    # Fallback labels
    OTHER = "other"
    UNKNOWN = "unknown"
|
|
|
|
|
|
|
|
class ClassificationScore(BaseModel):
    """Confidence assigned to one candidate document type."""

    document_type: DocumentType = Field(
        ...,
        description="Document type",
    )
    # Constrained to the closed interval [0.0, 1.0] by the ge/le bounds.
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Classification confidence",
    )
    reasoning: Optional[str] = Field(
        default=None,
        description="Reasoning for classification",
    )
|
|
|
|
|
|
|
|
class DocumentClassification(BaseModel):
    """
    Result of classifying a single document.

    Carries the primary (most likely) type with its confidence, optional
    per-type scores and supporting evidence, how the classification was
    produced, quality flags, and free-form extra attributes.
    """

    document_id: str = Field(..., description="Document identifier")

    # Primary classification
    primary_type: DocumentType = Field(..., description="Most likely document type")
    primary_confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Confidence in primary classification",
    )

    # Scores for every type that was considered (may be empty)
    scores: List[ClassificationScore] = Field(
        default_factory=list,
        description="Scores for all considered types",
    )

    # Evidence backing the classification
    evidence: List[EvidenceRef] = Field(
        default_factory=list,
        description="Evidence supporting classification",
    )

    # How the classification was produced
    method: str = Field(
        default="llm",
        description="Classification method used (llm/rule-based/hybrid)",
    )
    # NOTE(review): some Pydantic v2 releases emit a warning for field names
    # starting with "model_" (protected namespace) - confirm against the
    # pinned Pydantic version.
    model_used: Optional[str] = Field(default=None, description="Model used for classification")

    # Quality flags
    is_confident: bool = Field(
        default=True,
        description="Whether classification meets confidence threshold",
    )
    warnings: List[str] = Field(default_factory=list, description="Classification warnings")
    needs_human_review: bool = Field(
        default=False,
        description="Whether human review is recommended",
    )

    # Free-form extras
    attributes: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional detected attributes (language, domain, etc.)",
    )

    def get_top_k(self, k: int = 3) -> List[ClassificationScore]:
        """
        Return the ``k`` highest-confidence entries of ``scores``.

        Args:
            k: Maximum number of entries to return. Values <= 0 yield an
                empty list.

        Returns:
            Up to ``k`` scores ordered by descending confidence. Entries
            with equal confidence keep their relative order from ``scores``.
        """
        if k <= 0:
            # A negative slice bound would trim items from the tail rather
            # than return nothing, so handle non-positive k explicitly.
            return []
        ranked = sorted(self.scores, key=lambda s: s.confidence, reverse=True)
        return ranked[:k]

    def is_type(self, doc_type: DocumentType, min_confidence: float = 0.5) -> bool:
        """
        Check whether the document is classified as ``doc_type``.

        The primary classification is checked first, then every entry in
        ``scores``; ``scores`` defaults to empty, so the primary type must
        be considered for the answer to be meaningful.

        Args:
            doc_type: Document type to look for.
            min_confidence: Minimum confidence (inclusive) required.

        Returns:
            True if the primary classification or any score matches
            ``doc_type`` with confidence >= ``min_confidence``.
        """
        if self.primary_type == doc_type and self.primary_confidence >= min_confidence:
            return True
        return any(
            entry.document_type == doc_type and entry.confidence >= min_confidence
            for entry in self.scores
        )
|
|
|
|
|
|
|
|
class DocumentCategoryRule(BaseModel):
    """
    One rule used by rule-based document classification.

    A rule names its target document type, the patterns to look for, and
    the confidence parameters and priority associated with it.
    """

    name: str = Field(..., description="Rule name")
    document_type: DocumentType = Field(..., description="Target document type")

    # Matching criteria (each list defaults to empty)
    title_keywords: List[str] = Field(
        default_factory=list, description="Keywords to match in title"
    )
    content_keywords: List[str] = Field(
        default_factory=list, description="Keywords to match in content"
    )
    required_sections: List[str] = Field(
        default_factory=list, description="Required section headings"
    )
    file_patterns: List[str] = Field(
        default_factory=list, description="Filename patterns (regex)"
    )

    # Confidence parameters
    base_confidence: float = Field(
        default=0.8, ge=0.0, le=1.0, description="Base confidence when rule matches"
    )
    # The per-keyword boost is capped at 0.2 by the le bound.
    keyword_boost: float = Field(
        default=0.05, ge=0.0, le=0.2, description="Confidence boost per matched keyword"
    )

    # Ordering among rules
    priority: int = Field(default=0, description="Rule priority (higher = checked first)")
|
|
|
|
|
|
|
|
class ClassificationConfig(BaseModel):
    """
    Settings that control document classification.

    Groups confidence thresholds, which classification methods are enabled,
    custom rules, the set of document types to consider, and evidence
    options.
    """

    # Confidence thresholds, each bounded to [0.0, 1.0]
    min_confidence: float = Field(
        default=0.6, ge=0.0, le=1.0, description="Minimum confidence for classification"
    )
    human_review_threshold: float = Field(
        default=0.7, ge=0.0, le=1.0, description="Below this, flag for human review"
    )

    # Which classification methods are enabled
    use_llm: bool = Field(default=True, description="Use LLM for classification")
    use_rules: bool = Field(default=True, description="Use rule-based classification")
    hybrid_mode: str = Field(
        default="llm_primary",
        description="Hybrid mode: llm_primary, rules_primary, or ensemble",
    )

    # User-supplied rules
    custom_rules: List[DocumentCategoryRule] = Field(
        default_factory=list, description="Custom classification rules"
    )

    # Defaults to every DocumentType member, built fresh per instance.
    enabled_types: List[DocumentType] = Field(
        default_factory=lambda: list(DocumentType),
        description="Document types to consider",
    )

    # Evidence options
    require_evidence: bool = Field(
        default=True, description="Require evidence for classification"
    )
    max_evidence_snippets: int = Field(
        default=3, description="Maximum evidence snippets to include"
    )
|
|
|