""" |
|
|
Document Processing CLI Commands |
|
|
|
|
|
Commands: |
|
|
sparknet document parse <file> - Parse and extract text from document |
|
|
sparknet document extract <file> - Extract structured fields |
|
|
sparknet document classify <file> - Classify document type |
|
|
sparknet document analyze <file> - Full document analysis |
|
|
""" |
|
|
|
|
|
import asyncio
import json
from pathlib import Path
from typing import List, Optional

import typer

document_app = typer.Typer(
    name="document",
    help="Document processing commands",
)
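# This sub-app is meant to be mounted on the root CLI; typical wiring
# (root app name hypothetical) would be:
#
#   app = typer.Typer()
#   app.add_typer(document_app, name="document")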
|
|
@document_app.command("parse") |
|
|
def parse_document( |
|
|
file_path: Path = typer.Argument(..., help="Path to document file"), |
|
|
output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
|
|
ocr_engine: str = typer.Option("paddleocr", "--ocr", help="OCR engine: paddleocr, tesseract"), |
|
|
dpi: int = typer.Option(300, "--dpi", help="Rendering DPI for PDFs"), |
|
|
max_pages: Optional[int] = typer.Option(None, "--max-pages", help="Maximum pages to process"), |
|
|
include_images: bool = typer.Option(False, "--images", help="Include cropped region images"), |
|
|
): |
|
|
""" |
|
|
Parse a document and extract text with layout information. |
|
|
|
|
|
Example: |
|
|
sparknet document parse invoice.pdf -o result.json |
|
|
""" |
|
|
from loguru import logger |
|
|
|
|
|
if not file_path.exists(): |
|
|
typer.echo(f"Error: File not found: {file_path}", err=True) |
|
|
raise typer.Exit(1) |
|
|
|
|
|
typer.echo(f"Parsing document: {file_path}") |
|
|
|
|
|
try: |
|
|
from ..document.pipeline import ( |
|
|
PipelineConfig, |
|
|
get_document_processor, |
|
|
) |
|
|
from ..document.ocr import OCRConfig |
|
|
|
|
|
|
|
|
ocr_config = OCRConfig(engine=ocr_engine) |
|
|
config = PipelineConfig( |
|
|
ocr=ocr_config, |
|
|
render_dpi=dpi, |
|
|
max_pages=max_pages, |
|
|
) |
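        # NOTE: the --images flag (include_images) is accepted but not yet
        # forwarded to PipelineConfig; wiring it up would require a matching
        # config field (not confirmed to exist).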
        processor = get_document_processor(config)
        result = processor.process(str(file_path))

        output_data = {
            "document_id": result.metadata.document_id,
            "filename": result.metadata.filename,
            "num_pages": result.metadata.num_pages,
            "total_chunks": result.metadata.total_chunks,
            "total_characters": result.metadata.total_characters,
            "ocr_confidence": result.metadata.ocr_confidence_avg,
            "chunks": [
                {
                    "chunk_id": c.chunk_id,
                    "type": c.chunk_type.value,
                    "page": c.page,
                    "text": (c.text[:500] + "...") if len(c.text) > 500 else c.text,
                    "confidence": c.confidence,
                    "bbox": {
                        "x_min": c.bbox.x_min,
                        "y_min": c.bbox.y_min,
                        "x_max": c.bbox.x_max,
                        "y_max": c.bbox.y_max,
                    },
                }
                for c in result.chunks
            ],
            "full_text": (result.full_text[:2000] + "...") if len(result.full_text) > 2000 else result.full_text,
        }

        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            typer.echo(f"Results written to: {output}")
        else:
            typer.echo(json.dumps(output_data, indent=2))

        typer.echo(f"\nProcessed {result.metadata.num_pages} pages, {len(result.chunks)} chunks")

    except ImportError as e:
        typer.echo(f"Error: Missing dependency - {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error processing document: {e}", err=True)
        raise typer.Exit(1)
|
|
@document_app.command("extract") |
|
|
def extract_fields( |
|
|
file_path: Path = typer.Argument(..., help="Path to document file"), |
|
|
schema: Optional[Path] = typer.Option(None, "--schema", "-s", help="Extraction schema YAML file"), |
|
|
fields: Optional[List[str]] = typer.Option(None, "--field", "-f", help="Fields to extract (can use multiple)"), |
|
|
output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
|
|
validate: bool = typer.Option(True, "--validate/--no-validate", help="Validate extraction"), |
|
|
): |
|
|
""" |
|
|
Extract structured fields from a document. |
|
|
|
|
|
Example: |
|
|
sparknet document extract invoice.pdf -f "invoice_number" -f "total_amount" |
|
|
sparknet document extract contract.pdf --schema contract_schema.yaml |
|
|
""" |
|
|
from loguru import logger |
|
|
|
|
|
if not file_path.exists(): |
|
|
typer.echo(f"Error: File not found: {file_path}", err=True) |
|
|
raise typer.Exit(1) |
|
|
|
|
|
if not schema and not fields: |
|
|
typer.echo("Error: Provide --schema or --field options", err=True) |
|
|
raise typer.Exit(1) |
|
|
|
|
|
typer.echo(f"Extracting fields from: {file_path}") |
|
|
|
|
|
try: |
|
|
from ..document.schemas.extraction import ExtractionSchema, FieldDefinition |
|
|
from ..agents.document_agent import DocumentAgent |
|
|
|
|
|
|
|
|
if schema: |
|
|
import yaml |
|
|
with open(schema) as f: |
|
|
schema_data = yaml.safe_load(f) |
|
|
extraction_schema = ExtractionSchema(**schema_data) |
|
|
else: |
|
|
|
|
|
field_defs = [ |
|
|
FieldDefinition( |
|
|
name=f, |
|
|
field_type="string", |
|
|
required=True, |
|
|
) |
|
|
for f in fields |
|
|
] |
|
|
extraction_schema = ExtractionSchema( |
|
|
name="cli_extraction", |
|
|
fields=field_defs, |
|
|
) |
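        # For reference, a --schema YAML file mirrors the constructor calls
        # above; a hypothetical example (field names illustrative only):
        #
        #   name: invoice_extraction
        #   fields:
        #     - name: invoice_number
        #       field_type: string
        #       required: true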
        agent = DocumentAgent()

        async def _run():
            # Load and extract inside a single event loop so any loop-bound
            # resources the agent holds stay valid between the two steps.
            await agent.load_document(str(file_path))
            return await agent.extract_fields(extraction_schema)

        result = asyncio.run(_run())

        output_data = {
            "document": str(file_path),
            "fields": result.fields,
            "confidence": result.confidence,
            "evidence": [
                {
                    "chunk_id": e.chunk_id,
                    "page": e.page,
                    "snippet": e.snippet,
                }
                for e in result.evidence
            ] if result.evidence else [],
        }

        if validate and result.fields:
            from ..document.validation import get_extraction_critic

            critic = get_extraction_critic()
            evidence_chunks = [
                {"text": e.snippet, "page": e.page, "chunk_id": e.chunk_id}
                for e in result.evidence
            ] if result.evidence else []

            validation = critic.validate_extraction(result.fields, evidence_chunks)
            output_data["validation"] = {
                "status": validation.overall_status.value,
                "confidence": validation.overall_confidence,
                "should_accept": validation.should_accept,
                "abstain_reason": validation.abstain_reason,
            }

        if output:
            with open(output, "w") as f:
                json.dump(output_data, f, indent=2)
            typer.echo(f"Results written to: {output}")
        else:
            typer.echo(json.dumps(output_data, indent=2))

    except ImportError as e:
        typer.echo(f"Error: Missing dependency - {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error extracting fields: {e}", err=True)
        raise typer.Exit(1)
|
|
@document_app.command("classify") |
|
|
def classify_document( |
|
|
file_path: Path = typer.Argument(..., help="Path to document file"), |
|
|
output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
|
|
): |
|
|
""" |
|
|
Classify document type. |
|
|
|
|
|
Example: |
|
|
sparknet document classify document.pdf |
|
|
""" |
|
|
from loguru import logger |
|
|
|
|
|
if not file_path.exists(): |
|
|
typer.echo(f"Error: File not found: {file_path}", err=True) |
|
|
raise typer.Exit(1) |
|
|
|
|
|
typer.echo(f"Classifying document: {file_path}") |
|
|
|
|
|
try: |
|
|
from ..agents.document_agent import DocumentAgent |
|
|
import asyncio |
|
|
|
|
|
agent = DocumentAgent() |
|
|
asyncio.run(agent.load_document(str(file_path))) |
|
|
classification = asyncio.run(agent.classify()) |
|
|
|
|
|
output_data = { |
|
|
"document": str(file_path), |
|
|
"document_type": classification.document_type.value, |
|
|
"confidence": classification.confidence, |
|
|
"reasoning": classification.reasoning, |
|
|
"metadata": classification.metadata, |
|
|
} |
|
|
|
|
|
if output: |
|
|
with open(output, "w") as f: |
|
|
json.dump(output_data, f, indent=2) |
|
|
typer.echo(f"Results written to: {output}") |
|
|
else: |
|
|
typer.echo(json.dumps(output_data, indent=2)) |
|
|
|
|
|
except Exception as e: |
|
|
typer.echo(f"Error classifying document: {e}", err=True) |
|
|
raise typer.Exit(1) |
|
|
@document_app.command("ask") |
|
|
def ask_document( |
|
|
file_path: Path = typer.Argument(..., help="Path to document file"), |
|
|
question: str = typer.Argument(..., help="Question to ask about the document"), |
|
|
output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output JSON file"), |
|
|
): |
|
|
""" |
|
|
Ask a question about a document. |
|
|
|
|
|
Example: |
|
|
sparknet document ask invoice.pdf "What is the total amount?" |
|
|
""" |
|
|
from loguru import logger |
|
|
|
|
|
if not file_path.exists(): |
|
|
typer.echo(f"Error: File not found: {file_path}", err=True) |
|
|
raise typer.Exit(1) |
|
|
|
|
|
typer.echo(f"Processing question for: {file_path}") |
|
|
|
|
|
try: |
|
|
from ..agents.document_agent import DocumentAgent |
|
|
import asyncio |
|
|
|
|
|
agent = DocumentAgent() |
|
|
asyncio.run(agent.load_document(str(file_path))) |
|
|
answer, evidence = asyncio.run(agent.answer_question(question)) |
|
|
|
|
|
output_data = { |
|
|
"document": str(file_path), |
|
|
"question": question, |
|
|
"answer": answer, |
|
|
"evidence": [ |
|
|
{ |
|
|
"chunk_id": e.chunk_id, |
|
|
"page": e.page, |
|
|
"snippet": e.snippet, |
|
|
"confidence": e.confidence, |
|
|
} |
|
|
for e in evidence |
|
|
] if evidence else [], |
|
|
} |
|
|
|
|
|
if output: |
|
|
with open(output, "w") as f: |
|
|
json.dump(output_data, f, indent=2) |
|
|
typer.echo(f"Results written to: {output}") |
|
|
else: |
|
|
typer.echo(f"\nQuestion: {question}") |
|
|
typer.echo(f"\nAnswer: {answer}") |
|
|
if evidence: |
|
|
typer.echo(f"\nEvidence ({len(evidence)} sources):") |
|
|
for e in evidence[:3]: |
|
|
typer.echo(f" - Page {e.page + 1}: {e.snippet[:100]}...") |
|
|
|
|
|
except Exception as e: |
|
|
typer.echo(f"Error processing question: {e}", err=True) |
|
|
raise typer.Exit(1) |
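

if __name__ == "__main__":
    # Allows running this command group standalone for quick testing; the
    # invocation path below is illustrative, since the module is normally
    # mounted on the root sparknet CLI:
    #
    #   python -m sparknet.cli.document parse invoice.pdf
    document_app()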
|
|
|