anthropic-pdf-parser / pdf_atomic_parser.py

Upload 6 files

0ee11bd verified about 12 hours ago

52.5 kB

	"""
	pdf_atomic_parser.py
	====================
	Author : algorembrant
	Version : 1.0.0
	License : MIT

	DESCRIPTION
	-----------
	Atomically parse and understand complex PDF documents using Claude (default model: claude-opus-4-6).
	Handles equations, graphs, algorithms, unique drawings, tables, multi-column
	layouts, and 100+ page documents without hallucination. Designed for local
	agent pipelines.

	CAPABILITIES
	------------
	- Native PDF document API (base64) with prompt caching
	- Page-as-image fallback using PyMuPDF at 300 DPI for max fidelity
	- LaTeX equation extraction
	- Table extraction (Markdown + JSON)
	- Algorithm and pseudocode extraction
	- Figure and graph semantic description
	- Multi-column and complex layout handling
	- Chunked processing for 100+ page documents
	- SQLite-backed cache to avoid re-processing pages
	- Structured JSON output per page and full document
	- Agent-callable interface (AgentPDFInterface)
	- Async batch processing for speed

	USAGE COMMANDS
	--------------
	# Parse a PDF and save structured JSON
	python pdf_atomic_parser.py parse document.pdf

	# Parse with verbose output
	python pdf_atomic_parser.py parse document.pdf --verbose

	# Parse specific page range
	python pdf_atomic_parser.py parse document.pdf --pages 1-20

	# Extract only equations (LaTeX)
	python pdf_atomic_parser.py extract-equations document.pdf

	# Extract only tables (Markdown)
	python pdf_atomic_parser.py extract-tables document.pdf

	# Extract only algorithms/code blocks
	python pdf_atomic_parser.py extract-algorithms document.pdf

	# Extract figures and graph descriptions
	python pdf_atomic_parser.py extract-figures document.pdf

	# Full atomic extraction (all content types) to output dir
	python pdf_atomic_parser.py atomic document.pdf --output ./results/

	# Query a parsed PDF (semantic search over cached parse)
	python pdf_atomic_parser.py query document.pdf "What is the main theorem?"

	# Use faster/cheaper model (Sonnet instead of Opus)
	python pdf_atomic_parser.py parse document.pdf --model sonnet

	# Use page-as-image mode (higher fidelity for scanned/complex PDFs)
	python pdf_atomic_parser.py parse document.pdf --mode image

	# Use native PDF mode (default, faster)
	python pdf_atomic_parser.py parse document.pdf --mode native

	# Set chunk size for large PDFs (default 20 pages per chunk)
	python pdf_atomic_parser.py parse document.pdf --chunk-size 10

	# Clear cache for a document
	python pdf_atomic_parser.py clear-cache document.pdf

	# Show cache stats
	python pdf_atomic_parser.py cache-stats

	# List all cached documents
	python pdf_atomic_parser.py list-cache

	# Batch process a directory of PDFs
	python pdf_atomic_parser.py batch ./pdf_folder/ --output ./results/

	# Export parse results as Markdown report
	python pdf_atomic_parser.py parse document.pdf --format markdown

	# Export as plain text
	python pdf_atomic_parser.py parse document.pdf --format text

	# Show token usage estimate before parsing
	python pdf_atomic_parser.py estimate document.pdf

	# Agent interface example (programmatic)
	# from pdf_atomic_parser import AgentPDFInterface
	# agent = AgentPDFInterface()
	# result = agent.parse("document.pdf")
	# equations = agent.get_equations("document.pdf")
	"""

	from __future__ import annotations

	import argparse
	import asyncio
	import base64
	import contextlib
	import hashlib
	import json
	import logging
	import os
	import sqlite3
	import sys
	import time
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from dataclasses import asdict, dataclass, field
	from pathlib import Path
	from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple

	import anthropic
	import fitz # PyMuPDF
	from rich.console import Console
	from rich.logging import RichHandler
	from rich.progress import (
	    BarColumn,
	    MofNCompleteColumn,
	    Progress,
	    SpinnerColumn,
	    TaskProgressColumn,
	    TextColumn,
	    TimeElapsedColumn,
	    TimeRemainingColumn,
	)
	from rich.table import Table
	from tqdm import tqdm


	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# Full model identifiers; the short aliases "opus" / "sonnet" / "haiku"
	# accepted by the parser resolve to these.
	DEFAULT_MODEL_OPUS = "claude-opus-4-6"
	DEFAULT_MODEL_SONNET = "claude-sonnet-4-6"
	DEFAULT_MODEL_HAIKU = "claude-haiku-4-5-20251001"

	# Request sizing.
	MAX_TOKENS_OUTPUT = 8192  # max_tokens passed with every extraction request
	CHUNK_SIZE_DEFAULT = 20  # pages per API call
	IMAGE_DPI = 300  # render DPI for page-as-image mode
	MAX_PDF_SIZE_BYTES = 32 * 1024 * 1024  # 32 MB native API limit
	MAX_PDF_PAGES_NATIVE = 100  # native API page cap per request

	# Cache file name and log line layout.
	CACHE_DB_NAME = ".pdf_parser_cache.db"
	LOG_FORMAT = "%(message)s"

	# One shared console so progress bars, tables and log records do not
	# interleave badly.
	console = Console()

	# Root logging goes through rich; only warnings and above by default.
	logging.basicConfig(
	    level=logging.WARNING,
	    format=LOG_FORMAT,
	    handlers=[
	        RichHandler(console=console, rich_tracebacks=True, show_path=False),
	    ],
	)
	logger = logging.getLogger("pdf_atomic_parser")


	# ---------------------------------------------------------------------------
	# Data structures
	# ---------------------------------------------------------------------------

	@dataclass
	class EquationBlock:
	    """A single equation extracted from a page, stored as LaTeX."""

	    page: int  # page number the equation was found on
	    index: int  # position of the equation within its page
	    latex: str  # LaTeX source of the equation
	    description: str  # what the equation represents
	    inline: bool = False  # True for inline math, False for display/block math


	@dataclass
	class TableBlock:
	    """A single table extracted from a page, in Markdown and row-dict form."""

	    page: int  # page number the table was found on
	    index: int  # position of the table within its page
	    markdown: str  # table rendered as Markdown
	    json_data: List[Dict]  # same table as a list of row dictionaries
	    caption: str = ""  # table caption; empty when there is none


	@dataclass
	class AlgorithmBlock:
	    """A single algorithm or code listing extracted from a page."""

	    page: int  # page number the algorithm was found on
	    index: int  # position of the algorithm within its page
	    name: str  # algorithm name
	    language: str  # e.g. "pseudocode", "python", "cpp"
	    code: str  # algorithm text
	    description: str  # what the algorithm does


	@dataclass
	class FigureBlock:
	    """A single figure or graph extracted from a page, described in words."""

	    page: int  # page number the figure was found on
	    index: int  # position of the figure within its page
	    figure_type: str  # chart | diagram | drawing | photograph | plot
	    description: str  # semantic description of the visual
	    data_summary: str  # axes, units, trends and key values, if quantitative
	    caption: str = ""  # figure caption; empty when there is none


	@dataclass
	class PageResult:
	    """Everything extracted from one page, plus bookkeeping about the run."""

	    # Required page content.
	    page_number: int
	    raw_text: str
	    summary: str

	    # Typed content blocks; each instance gets its own fresh list.
	    equations: List[EquationBlock] = field(default_factory=list)
	    tables: List[TableBlock] = field(default_factory=list)
	    algorithms: List[AlgorithmBlock] = field(default_factory=list)
	    figures: List[FigureBlock] = field(default_factory=list)

	    # Lightweight text annotations.
	    section_headers: List[str] = field(default_factory=list)
	    references: List[str] = field(default_factory=list)
	    keywords: List[str] = field(default_factory=list)
	    layout_notes: str = ""

	    # Processing metadata.
	    processing_mode: str = "native"
	    tokens_used: int = 0
	    processing_time_s: float = 0.0


	@dataclass
	class DocumentResult:
	    """Aggregated result of parsing one document (or one page range of it)."""

	    # Identity of the document and of the run that produced this result.
	    document_path: str
	    document_hash: str
	    total_pages: int
	    pages_processed: int
	    model: str
	    processing_mode: str

	    # Document-level metadata.
	    title: str
	    authors: List[str]
	    abstract: str
	    document_summary: str

	    # Per-page results and totals across them.
	    page_results: List[PageResult] = field(default_factory=list)
	    total_equations: int = 0
	    total_tables: int = 0
	    total_algorithms: int = 0
	    total_figures: int = 0
	    total_tokens_used: int = 0
	    total_processing_time_s: float = 0.0


	# ---------------------------------------------------------------------------
	# Cache layer
	# ---------------------------------------------------------------------------

	class ParseCache:
	    """SQLite-backed cache for parsed page results.

	    Page rows are keyed by (document hash, page number, model, mode), so the
	    same page parsed with a different model or mode is cached separately.
	    Each operation opens its own short-lived connection and closes it again.
	    """

	    def __init__(self, cache_dir: Path):
	        """Create ``cache_dir`` if needed and make sure the schema exists."""
	        cache_dir.mkdir(parents=True, exist_ok=True)
	        self.db_path = cache_dir / CACHE_DB_NAME
	        self._init_db()

	    def _init_db(self) -> None:
	        """Create the ``page_cache`` and ``doc_meta`` tables if absent."""
	        with self._session() as conn:
	            conn.execute("""
	                CREATE TABLE IF NOT EXISTS page_cache (
	                    doc_hash TEXT NOT NULL,
	                    page_num INTEGER NOT NULL,
	                    model TEXT NOT NULL,
	                    mode TEXT NOT NULL,
	                    result_json TEXT NOT NULL,
	                    created_at REAL NOT NULL,
	                    PRIMARY KEY (doc_hash, page_num, model, mode)
	                )
	            """)
	            conn.execute("""
	                CREATE TABLE IF NOT EXISTS doc_meta (
	                    doc_hash TEXT PRIMARY KEY,
	                    doc_path TEXT NOT NULL,
	                    total_pages INTEGER NOT NULL,
	                    created_at REAL NOT NULL
	                )
	            """)

	    def _connect(self) -> sqlite3.Connection:
	        """Open a new connection in WAL journal mode. The caller must close it."""
	        conn = sqlite3.connect(self.db_path, timeout=30)
	        conn.execute("PRAGMA journal_mode=WAL")
	        return conn

	    @contextlib.contextmanager
	    def _session(self) -> Iterator[sqlite3.Connection]:
	        """Yield a connection that is committed (or rolled back) and then closed.

	        ``with connection:`` on its own only manages the transaction; it does
	        not close the connection, so the close is done explicitly here.
	        """
	        conn = self._connect()
	        try:
	            with conn:  # commit on success, roll back on exception
	                yield conn
	        finally:
	            conn.close()

	    @staticmethod
	    def file_hash(path: Path) -> str:
	        """Return the first 16 hex digits of the file's SHA-256 digest."""
	        h = hashlib.sha256()
	        with open(path, "rb") as fh:
	            # Read in 64 KiB chunks so large PDFs are never fully in memory.
	            for chunk in iter(lambda: fh.read(65536), b""):
	                h.update(chunk)
	        return h.hexdigest()[:16]

	    def get_page(self, doc_hash: str, page_num: int, model: str, mode: str) -> Optional[PageResult]:
	        """Return the cached result for this key, or ``None`` on a cache miss."""
	        with self._session() as conn:
	            row = conn.execute(
	                "SELECT result_json FROM page_cache WHERE doc_hash=? AND page_num=? AND model=? AND mode=?",
	                (doc_hash, page_num, model, mode),
	            ).fetchone()
	        if row is None:
	            return None
	        return self._deserialize_page(json.loads(row[0]))

	    def set_page(self, doc_hash: str, result: PageResult, model: str, mode: str) -> None:
	        """Insert or overwrite the cached result for ``result.page_number``."""
	        with self._session() as conn:
	            conn.execute(
	                "INSERT OR REPLACE INTO page_cache VALUES (?,?,?,?,?,?)",
	                (
	                    doc_hash,
	                    result.page_number,
	                    model,
	                    mode,
	                    json.dumps(self._serialize_page(result)),
	                    time.time(),
	                ),
	            )

	    def clear_document(self, doc_hash: str) -> int:
	        """Delete all rows for a document; return the number of page rows removed."""
	        with self._session() as conn:
	            cur = conn.execute("DELETE FROM page_cache WHERE doc_hash=?", (doc_hash,))
	            conn.execute("DELETE FROM doc_meta WHERE doc_hash=?", (doc_hash,))
	            return cur.rowcount

	    def stats(self) -> Dict[str, Any]:
	        """Return cached page count, distinct document count and DB file size in MB."""
	        with self._session() as conn:
	            total = conn.execute("SELECT COUNT(*) FROM page_cache").fetchone()[0]
	            docs = conn.execute("SELECT COUNT(DISTINCT doc_hash) FROM page_cache").fetchone()[0]
	            size = self.db_path.stat().st_size if self.db_path.exists() else 0
	        return {"total_cached_pages": total, "unique_documents": docs, "cache_size_mb": round(size / 1e6, 2)}

	    def list_documents(self) -> List[Dict]:
	        """Return one summary dict per cached document hash."""
	        with self._session() as conn:
	            rows = conn.execute("""
	                SELECT doc_hash, COUNT(*) as pages, MIN(created_at) as first_seen
	                FROM page_cache GROUP BY doc_hash
	            """).fetchall()
	        return [{"hash": r[0], "cached_pages": r[1], "first_seen": r[2]} for r in rows]

	    # -- serialization helpers -----------------------------------------------

	    @staticmethod
	    def _serialize_page(p: PageResult) -> Dict:
	        """Convert a PageResult (and its nested blocks) into plain dicts."""
	        return asdict(p)

	    @staticmethod
	    def _deserialize_page(d: Dict) -> PageResult:
	        """Rebuild a PageResult from the dict produced by ``_serialize_page``.

	        Works on a shallow copy so the caller's dict is left untouched.
	        """
	        data = dict(d)
	        data["equations"] = [EquationBlock(**e) for e in d.get("equations", [])]
	        data["tables"] = [TableBlock(**t) for t in d.get("tables", [])]
	        data["algorithms"] = [AlgorithmBlock(**a) for a in d.get("algorithms", [])]
	        data["figures"] = [FigureBlock(**f) for f in d.get("figures", [])]
	        return PageResult(**data)


	# ---------------------------------------------------------------------------
	# PDF utilities
	# ---------------------------------------------------------------------------

	class PDFDocument:
	    """Thin wrapper around fitz.Document with chunking helpers.

	    Usable as a context manager; the underlying document is closed on exit.
	    """

	    def __init__(self, path: Path):
	        """Open the PDF at ``path`` and record its page count."""
	        self.path = path
	        self._doc = fitz.open(str(path))
	        self.total_pages = len(self._doc)

	    @property
	    def file_size_bytes(self) -> int:
	        """Size of the PDF file on disk, in bytes."""
	        return self.path.stat().st_size

	    def get_chunk_ranges(self, chunk_size: int) -> List[Tuple[int, int]]:
	        """Return list of (start_page_0indexed, end_page_exclusive) tuples.

	        Raises
	        ------
	        ValueError
	            If ``chunk_size`` is not a positive integer.
	        """
	        if chunk_size < 1:
	            # A zero step makes range() fail; a negative one would silently
	            # produce no chunks at all.
	            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
	        return [
	            (start, min(start + chunk_size, self.total_pages))
	            for start in range(0, self.total_pages, chunk_size)
	        ]

	    def get_chunk_as_pdf_bytes(self, start: int, end: int) -> bytes:
	        """Extract pages [start, end) into a new in-memory PDF."""
	        sub = fitz.open()
	        try:
	            # insert_pdf takes an inclusive last page, hence end - 1.
	            sub.insert_pdf(self._doc, from_page=start, to_page=end - 1)
	            return sub.write()
	        finally:
	            # Close the temporary document so it is not leaked per chunk.
	            sub.close()

	    def get_page_as_png_bytes(self, page_idx: int, dpi: int = IMAGE_DPI) -> bytes:
	        """Render a single page to PNG bytes at given DPI."""
	        page = self._doc[page_idx]
	        # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
	        mat = fitz.Matrix(dpi / 72, dpi / 72)
	        pix = page.get_pixmap(matrix=mat, alpha=False)
	        return pix.tobytes("png")

	    def close(self) -> None:
	        """Close the underlying document."""
	        self._doc.close()

	    def __enter__(self):
	        return self

	    def __exit__(self, *_):
	        self.close()


	# ---------------------------------------------------------------------------
	# Extraction prompts
	# ---------------------------------------------------------------------------

	# System prompt sent with every extraction and metadata request.
	SYSTEM_PROMPT = """You are an expert scientific document analyst specializing in atomically
	parsing complex academic and technical PDFs. Your extractions must be:
	- Complete: capture every equation, table, figure, and algorithm
	- Faithful: never invent or hallucinate content
	- Precise: reproduce equations in proper LaTeX
	- Structured: respond only with valid JSON matching the schema provided

	Do NOT add prose outside the JSON response. If a field has no content, use an
	empty list [] or empty string "" rather than null."""

	# Per-page extraction schema. Callers append a suffix naming the page range
	# and the number of page objects expected.
	PAGE_EXTRACTION_PROMPT = """\
	Atomically parse the provided PDF page(s) and return a JSON object that matches
	this schema exactly:

	{
	"raw_text": "<full verbatim text extracted from page, preserving paragraphs>",
	"summary": "<2-4 sentence factual summary of this page>",
	"section_headers": ["<header string>", ...],
	"keywords": ["<important technical term>", ...],
	"layout_notes": "<describe columns, special layouts, footnotes, margin notes>",
	"equations": [
	{
	"index": <int starting at 0>,
	"latex": "<complete LaTeX representation>",
	"description": "<what this equation represents>",
	"inline": <true if inline, false if display/block>
	}
	],
	"tables": [
	{
	"index": <int>,
	"markdown": "<GitHub-flavored Markdown table>",
	"json_data": [{"col1": "val", ...}, ...],
	"caption": "<table caption or empty string>"
	}
	],
	"algorithms": [
	{
	"index": <int>,
	"name": "<algorithm name or Algorithm N>",
	"language": "<pseudocode | python | cpp | generic | etc.>",
	"code": "<verbatim algorithm text, preserve indentation>",
	"description": "<what this algorithm does>"
	}
	],
	"figures": [
	{
	"index": <int>,
	"figure_type": "<chart | bar_chart | line_chart | scatter_plot | histogram | diagram | flowchart | neural_network | tree | graph | drawing | photograph | heatmap | 3d_plot | other>",
	"description": "<detailed semantic description of the visual>",
	"data_summary": "<describe axes, units, trend, key values if quantitative>",
	"caption": "<figure caption or empty string>"
	}
	],
	"references": ["<any in-text citation or bibliography entry on this page>"]
	}

	Rules:
	1. Every equation MUST have LaTeX. Use \\frac, \\sum, \\int, \\mathbf etc. for proper notation.
	2. Tables must be fully reproduced in both Markdown and as list-of-dicts.
	3. Algorithms must preserve all steps, loops, conditions verbatim.
	4. Figures: describe them as if for a blind reader — quantitative values, trends, colors, labels.
	5. raw_text must include ALL text visible on the page, including headers, footers, captions.
	6. Do NOT summarize or truncate any content.
	"""

	# Document-level metadata schema, applied to a sample of already-extracted text.
	DOCUMENT_META_PROMPT = """\
	Based on the document pages you have seen, extract high-level metadata as JSON:

	{
	"title": "<document title>",
	"authors": ["<author name>", ...],
	"abstract": "<full abstract text or empty string if none>",
	"document_summary": "<comprehensive 5-8 sentence summary of the entire document>"
	}

	Respond with valid JSON only.
	"""


	# ---------------------------------------------------------------------------
	# Core parser
	# ---------------------------------------------------------------------------

	class AtomicPDFParser:
	    """
	    Core parser that sends PDF chunks or page images to the Claude API
	    and extracts structured content atomically.

	    Successfully parsed pages are written to a ParseCache keyed by document
	    hash, page number, model and mode; placeholder pages produced after a
	    failed or incomplete response are never cached, so a later run retries them.
	    """

	    def __init__(
	        self,
	        api_key: Optional[str] = None,
	        model: str = DEFAULT_MODEL_OPUS,
	        mode: str = "native",  # "native" | "image"
	        chunk_size: int = CHUNK_SIZE_DEFAULT,
	        cache_dir: Optional[Path] = None,
	        verbose: bool = False,
	        max_workers: int = 4,
	    ):
	        """
	        Parameters
	        ----------
	        api_key : API key; falls back to the ANTHROPIC_API_KEY environment variable.
	        model : Full model id, or one of the aliases "opus" / "sonnet" / "haiku".
	        mode : "image" renders each page to PNG; any other value sends a native PDF chunk.
	        chunk_size : Number of pages sent per API call; must be >= 1.
	        cache_dir : Directory for the cache DB; defaults to ~/.cache/pdf_atomic_parser.
	        verbose : When true, raise this module's logger to DEBUG.
	        max_workers : Stored on the instance.

	        Raises
	        ------
	        ValueError if no API key is available or chunk_size is not positive.
	        """
	        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "")
	        self.model = self._resolve_model(model)
	        self.mode = mode
	        self.chunk_size = chunk_size
	        self.verbose = verbose
	        self.max_workers = max_workers

	        if not self.api_key:
	            raise ValueError(
	                "ANTHROPIC_API_KEY environment variable not set. "
	                "Export it or pass api_key= to AtomicPDFParser."
	            )
	        if chunk_size < 1:
	            # A zero step breaks range() in parse(); a negative one would
	            # silently parse nothing.
	            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

	        self.client = anthropic.Anthropic(api_key=self.api_key)

	        cache_path = cache_dir or Path.home() / ".cache" / "pdf_atomic_parser"
	        self.cache = ParseCache(cache_path)

	        if verbose:
	            logger.setLevel(logging.DEBUG)

	    @staticmethod
	    def _resolve_model(alias: str) -> str:
	        """Map a short alias to a full model id; unknown values pass through."""
	        mapping = {
	            "opus": DEFAULT_MODEL_OPUS,
	            "sonnet": DEFAULT_MODEL_SONNET,
	            "haiku": DEFAULT_MODEL_HAIKU,
	        }
	        return mapping.get(alias.lower(), alias)

	    # ------------------------------------------------------------------
	    # Public API
	    # ------------------------------------------------------------------

	    def parse(
	        self,
	        pdf_path: str | Path,
	        page_range: Optional[Tuple[int, int]] = None,
	    ) -> DocumentResult:
	        """
	        Parse the entire document (or a page range) atomically.

	        Parameters
	        ----------
	        pdf_path : Path to the PDF file.
	        page_range : Optional (start, end) 1-indexed inclusive page numbers.

	        Returns
	        -------
	        DocumentResult with full structured extraction.

	        Raises
	        ------
	        FileNotFoundError if ``pdf_path`` does not exist.
	        """
	        path = Path(pdf_path).resolve()
	        if not path.exists():
	            raise FileNotFoundError(f"PDF not found: {path}")

	        doc_hash = self.cache.file_hash(path)
	        t_start = time.time()

	        with PDFDocument(path) as pdf:
	            total = pdf.total_pages
	            if page_range:
	                # Convert 1-indexed inclusive bounds to a 0-indexed half-open
	                # range, clamped to the document.
	                p_start = max(0, page_range[0] - 1)
	                p_end = min(total, page_range[1])
	            else:
	                p_start, p_end = 0, total

	            chunks = [
	                (s, min(s + self.chunk_size, p_end))
	                for s in range(p_start, p_end, self.chunk_size)
	            ]

	            page_results: List[PageResult] = []

	            with Progress(
	                SpinnerColumn(),
	                TextColumn("[bold cyan]{task.description}"),
	                BarColumn(),
	                MofNCompleteColumn(),
	                TaskProgressColumn(),
	                TimeElapsedColumn(),
	                TimeRemainingColumn(),
	                console=console,
	                transient=False,
	            ) as progress:
	                task = progress.add_task(
	                    f"[cyan]Parsing {path.name}", total=len(chunks)
	                )

	                for chunk_start, chunk_end in chunks:
	                    chunk_pages = self._parse_chunk(
	                        pdf, doc_hash, chunk_start, chunk_end
	                    )
	                    page_results.extend(chunk_pages)
	                    progress.advance(task)

	        # Build document-level metadata
	        meta = self._extract_document_meta(page_results)

	        return DocumentResult(
	            document_path=str(path),
	            document_hash=doc_hash,
	            total_pages=total,
	            pages_processed=len(page_results),
	            model=self.model,
	            processing_mode=self.mode,
	            # "or" guards against the model returning null for a field.
	            title=meta.get("title") or "",
	            authors=meta.get("authors") or [],
	            abstract=meta.get("abstract") or "",
	            document_summary=meta.get("document_summary") or "",
	            page_results=page_results,
	            total_equations=sum(len(p.equations) for p in page_results),
	            total_tables=sum(len(p.tables) for p in page_results),
	            total_algorithms=sum(len(p.algorithms) for p in page_results),
	            total_figures=sum(len(p.figures) for p in page_results),
	            total_tokens_used=sum(p.tokens_used for p in page_results),
	            total_processing_time_s=time.time() - t_start,
	        )

	    def extract_equations(self, pdf_path: str | Path) -> List[EquationBlock]:
	        """Parse the document and return every equation, in page order."""
	        result = self.parse(pdf_path)
	        return [eq for p in result.page_results for eq in p.equations]

	    def extract_tables(self, pdf_path: str | Path) -> List[TableBlock]:
	        """Parse the document and return every table, in page order."""
	        result = self.parse(pdf_path)
	        return [tb for p in result.page_results for tb in p.tables]

	    def extract_algorithms(self, pdf_path: str | Path) -> List[AlgorithmBlock]:
	        """Parse the document and return every algorithm, in page order."""
	        result = self.parse(pdf_path)
	        return [al for p in result.page_results for al in p.algorithms]

	    def extract_figures(self, pdf_path: str | Path) -> List[FigureBlock]:
	        """Parse the document and return every figure, in page order."""
	        result = self.parse(pdf_path)
	        return [fg for p in result.page_results for fg in p.figures]

	    def query(self, pdf_path: str | Path, question: str) -> str:
	        """
	        Semantic query over cached parse results. Re-parses if not cached.

	        Only the first 60000 characters of the concatenated page text are
	        sent along with the question.
	        """
	        result = self.parse(pdf_path)
	        full_text = "\n\n".join(
	            f"[Page {p.page_number}]\n{p.raw_text}" for p in result.page_results
	        )
	        messages = [
	            {
	                "role": "user",
	                "content": (
	                    f"Based on the following document content, answer this question "
	                    f"precisely and cite page numbers where relevant.\n\n"
	                    f"Question: {question}\n\n"
	                    f"Document content:\n{full_text[:60000]}"
	                ),
	            }
	        ]
	        resp = self.client.messages.create(
	            model=self.model,
	            max_tokens=2048,
	            messages=messages,
	        )
	        return resp.content[0].text

	    # ------------------------------------------------------------------
	    # Internal methods
	    # ------------------------------------------------------------------

	    def _parse_chunk(
	        self,
	        pdf: PDFDocument,
	        doc_hash: str,
	        chunk_start: int,
	        chunk_end: int,
	    ) -> List[PageResult]:
	        """Parse a range of pages, using cache when available.

	        ``chunk_start`` / ``chunk_end`` are 0-indexed, end-exclusive; cache
	        keys and returned page numbers are 1-indexed.
	        """
	        results = []
	        pages_to_process = []

	        for pg in range(chunk_start, chunk_end):
	            cached = self.cache.get_page(doc_hash, pg + 1, self.model, self.mode)
	            if cached is not None:
	                logger.debug("Cache hit page %d", pg + 1)
	                results.append(cached)
	            else:
	                pages_to_process.append(pg)

	        if not pages_to_process:
	            return results

	        # Group consecutive un-cached pages into sub-chunks
	        for sub_start, sub_end in self._group_consecutive(pages_to_process):
	            results.extend(self._call_api_chunk(pdf, doc_hash, sub_start, sub_end))

	        results.sort(key=lambda r: r.page_number)
	        return results

	    @staticmethod
	    def _group_consecutive(pages: List[int]) -> List[Tuple[int, int]]:
	        """Collapse ascending page indices into (start, end_exclusive) runs."""
	        if not pages:
	            return []
	        groups, start, prev = [], pages[0], pages[0]
	        for p in pages[1:]:
	            if p != prev + 1:
	                groups.append((start, prev + 1))
	                start = p
	            prev = p
	        groups.append((start, prev + 1))
	        return groups

	    def _call_api_chunk(
	        self,
	        pdf: PDFDocument,
	        doc_hash: str,
	        chunk_start: int,
	        chunk_end: int,
	    ) -> List[PageResult]:
	        """Send pages to Claude API and parse response."""
	        t_start = time.time()

	        if self.mode == "image":
	            return self._call_api_as_images(pdf, doc_hash, chunk_start, chunk_end, t_start)
	        return self._call_api_native(pdf, doc_hash, chunk_start, chunk_end, t_start)

	    def _call_api_native(
	        self,
	        pdf: PDFDocument,
	        doc_hash: str,
	        chunk_start: int,
	        chunk_end: int,
	        t_start: float,
	    ) -> List[PageResult]:
	        """Send pages [chunk_start, chunk_end) as one base64 PDF document block."""
	        chunk_bytes = pdf.get_chunk_as_pdf_bytes(chunk_start, chunk_end)
	        b64_pdf = base64.standard_b64encode(chunk_bytes).decode("utf-8")
	        num_pages = chunk_end - chunk_start

	        prompt_suffix = (
	            f"\nThis PDF chunk contains pages {chunk_start + 1} to {chunk_end} "
	            f"of the original document. "
	            f"Return a JSON array with exactly {num_pages} page objects matching the schema. "
	            f"Index them page_number={chunk_start + 1} through {chunk_end}."
	        )

	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "document",
	                        "source": {
	                            "type": "base64",
	                            "media_type": "application/pdf",
	                            "data": b64_pdf,
	                        },
	                        "cache_control": {"type": "ephemeral"},
	                    },
	                    {
	                        "type": "text",
	                        "text": PAGE_EXTRACTION_PROMPT + prompt_suffix,
	                    },
	                ],
	            }
	        ]

	        return self._execute_api_call(messages, doc_hash, chunk_start, chunk_end, t_start, "native")

	    def _call_api_as_images(
	        self,
	        pdf: PDFDocument,
	        doc_hash: str,
	        chunk_start: int,
	        chunk_end: int,
	        t_start: float,
	    ) -> List[PageResult]:
	        """Send pages [chunk_start, chunk_end) as labelled PNG image blocks."""
	        content = []
	        for pg_idx in range(chunk_start, chunk_end):
	            png_bytes = pdf.get_page_as_png_bytes(pg_idx, dpi=IMAGE_DPI)
	            b64_img = base64.standard_b64encode(png_bytes).decode("utf-8")
	            # A text marker before each image ties the image to its page number.
	            content.append({
	                "type": "text",
	                "text": f"--- Page {pg_idx + 1} ---",
	            })
	            content.append({
	                "type": "image",
	                "source": {
	                    "type": "base64",
	                    "media_type": "image/png",
	                    "data": b64_img,
	                },
	            })

	        num_pages = chunk_end - chunk_start
	        prompt_suffix = (
	            f"\nThese are page images {chunk_start + 1} through {chunk_end}. "
	            f"Return a JSON array with exactly {num_pages} page objects matching the schema. "
	            f"Index them page_number={chunk_start + 1} through {chunk_end}."
	        )
	        content.append({"type": "text", "text": PAGE_EXTRACTION_PROMPT + prompt_suffix})

	        messages = [{"role": "user", "content": content}]
	        return self._execute_api_call(messages, doc_hash, chunk_start, chunk_end, t_start, "image")

	    @staticmethod
	    def _strip_code_fence(text: str) -> str:
	        """Strip whitespace and one surrounding Markdown code fence, if present."""
	        text = text.strip()
	        if not text.startswith("```"):
	            return text
	        lines = text.split("\n")
	        # Always drop the opening fence line; drop the last line only when it
	        # is a closing fence.
	        body = lines[1:-1] if lines[-1].strip() == "```" else lines[1:]
	        return "\n".join(body)

	    @staticmethod
	    def _failed_page(
	        page_number: int,
	        mode: str,
	        raw_text: str,
	        tokens_used: int,
	        processing_time_s: float,
	    ) -> PageResult:
	        """Build a placeholder PageResult for a page that could not be parsed."""
	        return PageResult(
	            page_number=page_number,
	            raw_text=raw_text,
	            summary="Failed to parse this page.",
	            processing_mode=mode,
	            tokens_used=tokens_used,
	            processing_time_s=processing_time_s,
	        )

	    def _execute_api_call(
	        self,
	        messages: List[Dict],
	        doc_hash: str,
	        chunk_start: int,
	        chunk_end: int,
	        t_start: float,
	        mode: str,
	    ) -> List[PageResult]:
	        """Call the API, decode the JSON reply and return one result per page.

	        Rate-limit errors are retried up to three times with a doubling
	        delay; other API status errors are logged and re-raised. Exactly
	        ``chunk_end - chunk_start`` results are returned: surplus page objects
	        in the reply are ignored, and pages that are missing or malformed get
	        an uncached placeholder.
	        """
	        retries, delay = 3, 5
	        for attempt in range(retries):
	            try:
	                resp = self.client.messages.create(
	                    model=self.model,
	                    max_tokens=MAX_TOKENS_OUTPUT,
	                    system=SYSTEM_PROMPT,
	                    messages=messages,
	                )
	                break
	            except anthropic.RateLimitError:
	                if attempt == retries - 1:
	                    raise
	                logger.warning("Rate limit hit; retrying in %ds...", delay)
	                time.sleep(delay)
	                delay *= 2
	            except anthropic.APIStatusError as exc:
	                logger.error("API error: %s", exc)
	                raise

	        expected = chunk_end - chunk_start
	        raw_response = self._strip_code_fence(resp.content[0].text)
	        tokens_used = resp.usage.input_tokens + resp.usage.output_tokens
	        elapsed = time.time() - t_start
	        tokens_per_page = tokens_used // max(1, expected)

	        if getattr(resp, "stop_reason", None) == "max_tokens":
	            # A truncated reply is the usual cause of the decode error below.
	            logger.warning(
	                "Response for pages %d-%d hit the output token limit; "
	                "consider a smaller chunk size.",
	                chunk_start + 1, chunk_end,
	            )

	        try:
	            parsed = json.loads(raw_response)
	        except json.JSONDecodeError as exc:
	            logger.error("JSON parse error on API response: %s\nRaw:\n%s", exc, raw_response[:500])
	            # Return minimal fallback for affected pages
	            return [
	                self._failed_page(
	                    pg + 1,
	                    mode,
	                    "[PARSE ERROR: JSON decode failed]",
	                    tokens_per_page,
	                    elapsed,
	                )
	                for pg in range(chunk_start, chunk_end)
	            ]

	        # Handle both array-of-pages and single-page responses
	        if isinstance(parsed, dict):
	            parsed = [parsed]
	        elif not isinstance(parsed, list):
	            parsed = []

	        if len(parsed) != expected:
	            logger.warning(
	                "Expected %d page objects for pages %d-%d but received %d",
	                expected, chunk_start + 1, chunk_end, len(parsed),
	            )

	        time_per_page = elapsed / max(1, expected)
	        results = []
	        # Iterate over the pages that were requested, not over the reply, so
	        # surplus objects can never be stored under page numbers outside the
	        # chunk.
	        for offset in range(expected):
	            pg_num = chunk_start + offset + 1
	            page_data = parsed[offset] if offset < len(parsed) else None
	            if not isinstance(page_data, dict):
	                results.append(
	                    self._failed_page(
	                        pg_num,
	                        mode,
	                        "[PARSE ERROR: page missing from API response]",
	                        tokens_per_page,
	                        time_per_page,
	                    )
	                )
	                continue

	            # Position in the reply, not the model's own numbering, decides
	            # the page number.
	            page_data["page_number"] = pg_num
	            page_data["processing_mode"] = mode
	            page_data["tokens_used"] = tokens_per_page
	            page_data["processing_time_s"] = time_per_page

	            pr = self._dict_to_page_result(page_data)
	            self.cache.set_page(doc_hash, pr, self.model, mode)
	            results.append(pr)

	        return results

	    @staticmethod
	    def _dict_to_page_result(d: Dict) -> PageResult:
	        """Convert one decoded page object into a PageResult.

	        ``d`` must contain "page_number". Every other field is optional;
	        missing or null fields become empty values, and list entries that
	        are not objects are skipped.
	        """

	        def entries(key: str) -> List[Dict]:
	            # Tolerate null or non-list values and non-object list entries.
	            value = d.get(key) or []
	            if not isinstance(value, list):
	                return []
	            return [item for item in value if isinstance(item, dict)]

	        page_no = d["page_number"]
	        equations = [
	            EquationBlock(
	                page=page_no,
	                index=e.get("index", i),
	                latex=e.get("latex", ""),
	                description=e.get("description", ""),
	                inline=e.get("inline", False),
	            )
	            for i, e in enumerate(entries("equations"))
	        ]
	        tables = [
	            TableBlock(
	                page=page_no,
	                index=t.get("index", i),
	                markdown=t.get("markdown", ""),
	                json_data=t.get("json_data", []),
	                caption=t.get("caption", ""),
	            )
	            for i, t in enumerate(entries("tables"))
	        ]
	        algorithms = [
	            AlgorithmBlock(
	                page=page_no,
	                index=a.get("index", i),
	                name=a.get("name", f"Algorithm {i+1}"),
	                language=a.get("language", "pseudocode"),
	                code=a.get("code", ""),
	                description=a.get("description", ""),
	            )
	            for i, a in enumerate(entries("algorithms"))
	        ]
	        figures = [
	            FigureBlock(
	                page=page_no,
	                index=f.get("index", i),
	                figure_type=f.get("figure_type", "other"),
	                description=f.get("description", ""),
	                data_summary=f.get("data_summary", ""),
	                caption=f.get("caption", ""),
	            )
	            for i, f in enumerate(entries("figures"))
	        ]
	        return PageResult(
	            page_number=page_no,
	            raw_text=d.get("raw_text") or "",
	            summary=d.get("summary") or "",
	            equations=equations,
	            tables=tables,
	            algorithms=algorithms,
	            figures=figures,
	            section_headers=d.get("section_headers") or [],
	            references=d.get("references") or [],
	            keywords=d.get("keywords") or [],
	            layout_notes=d.get("layout_notes") or "",
	            processing_mode=d.get("processing_mode", "native"),
	            tokens_used=d.get("tokens_used", 0),
	            processing_time_s=d.get("processing_time_s", 0.0),
	        )

	    def _extract_document_meta(self, page_results: List[PageResult]) -> Dict:
	        """Ask the model for title/authors/abstract/summary from a text sample.

	        Uses the raw text of the first five parsed pages, truncated to 8000
	        characters. This step is best-effort: any failure is logged and empty
	        metadata is returned instead.
	        """
	        empty_meta = {"title": "", "authors": [], "abstract": "", "document_summary": ""}
	        if not page_results:
	            # Nothing to sample, so skip the API call.
	            return empty_meta

	        # Use first 5 pages for metadata extraction
	        sample_text = "\n\n".join(
	            f"[Page {p.page_number}]\n{p.raw_text}" for p in page_results[:5]
	        )
	        messages = [
	            {
	                "role": "user",
	                "content": (
	                    f"{DOCUMENT_META_PROMPT}\n\nDocument sample:\n{sample_text[:8000]}"
	                ),
	            }
	        ]
	        try:
	            resp = self.client.messages.create(
	                model=self.model,
	                max_tokens=1024,
	                system=SYSTEM_PROMPT,
	                messages=messages,
	            )
	            meta = json.loads(self._strip_code_fence(resp.content[0].text))
	        except Exception as exc:
	            # Deliberately broad: metadata must never abort a finished parse.
	            logger.warning("Document meta extraction failed: %s", exc)
	            return empty_meta

	        if not isinstance(meta, dict):
	            logger.warning("Document meta extraction failed: %s", "response was not a JSON object")
	            return empty_meta
	        return meta


	# ---------------------------------------------------------------------------
	# Output formatters
	# ---------------------------------------------------------------------------

	class OutputFormatter:
	    """Render a DocumentResult as JSON, Markdown, plain text or a console table."""

	    @staticmethod
	    def to_json(result: DocumentResult, indent: int = 2) -> str:
	        """Serialize the full result to JSON, keeping non-ASCII text readable."""
	        return json.dumps(asdict(result), indent=indent, ensure_ascii=False)

	    @staticmethod
	    def to_markdown(result: DocumentResult) -> str:
	        """Build a Markdown report: a document header, then one section per page."""
	        lines = []
	        # Fall back to the file name when no title was extracted.
	        lines.append(f"# {result.title or Path(result.document_path).name}")
	        if result.authors:
	            lines.append(f"\nAuthors: {', '.join(result.authors)}")
	        lines.append(f"\nDocument Hash: `{result.document_hash}`")
	        lines.append(f"Model: {result.model} | Mode: {result.processing_mode}")
	        lines.append(
	            f"Pages: {result.pages_processed}/{result.total_pages} | "
	            f"Tokens: {result.total_tokens_used:,} | "
	            f"Time: {result.total_processing_time_s:.1f}s"
	        )
	        lines.append(
	            f"Equations: {result.total_equations} | "
	            f"Tables: {result.total_tables} | "
	            f"Algorithms: {result.total_algorithms} | "
	            f"Figures: {result.total_figures}"
	        )
	        if result.abstract:
	            lines.append(f"\n## Abstract\n\n{result.abstract}")
	        if result.document_summary:
	            lines.append(f"\n## Document Summary\n\n{result.document_summary}")

	        for page in result.page_results:
	            lines.append(f"\n---\n\n## Page {page.page_number}")
	            if page.section_headers:
	                lines.append("\n### Sections\n" + "\n".join(f"- {h}" for h in page.section_headers))
	            lines.append(f"\n### Summary\n{page.summary}")
	            lines.append(f"\n### Full Text\n\n{page.raw_text}")

	            if page.equations:
	                lines.append("\n### Equations\n")
	                for eq in page.equations:
	                    lines.append(f"Eq {eq.index} ({('inline' if eq.inline else 'display')})")
	                    lines.append(f"```latex\n{eq.latex}\n```")
	                    lines.append(f"{eq.description}\n")

	            if page.tables:
	                lines.append("\n### Tables\n")
	                for tb in page.tables:
	                    if tb.caption:
	                        lines.append(f"{tb.caption}\n")
	                    lines.append(tb.markdown + "\n")

	            if page.algorithms:
	                lines.append("\n### Algorithms\n")
	                for al in page.algorithms:
	                    lines.append(f"{al.name} ({al.language})\n")
	                    lines.append(f"```{al.language}\n{al.code}\n```")
	                    lines.append(f"{al.description}\n")

	            if page.figures:
	                lines.append("\n### Figures\n")
	                for fg in page.figures:
	                    lines.append(f"Figure {fg.index} [{fg.figure_type}]")
	                    if fg.caption:
	                        lines.append(f"{fg.caption}")
	                    lines.append(fg.description)
	                    if fg.data_summary:
	                        lines.append(f"Data: {fg.data_summary}\n")

	        return "\n".join(lines)

	    @staticmethod
	    def to_text(result: DocumentResult) -> str:
	        """Build a plain-text dump: a short header, then the raw text of every page."""
	        lines = [
	            f"DOCUMENT: {result.title or Path(result.document_path).name}",
	            f"Authors: {', '.join(result.authors)}",
	            f"Pages processed: {result.pages_processed}/{result.total_pages}",
	            "",
	            "SUMMARY",
	            "=" * 60,
	            result.document_summary,
	            "",
	        ]
	        for page in result.page_results:
	            lines.append(f"\n[PAGE {page.page_number}]")
	            lines.append(page.raw_text)
	        return "\n".join(lines)

	    @staticmethod
	    def print_summary_table(result: DocumentResult) -> None:
	        """Print a two-column metric/value table for the result to the shared console."""
	        table = Table(title=f"Parse Results: {Path(result.document_path).name}", show_lines=True)
	        table.add_column("Metric", style="cyan", no_wrap=True)
	        table.add_column("Value", style="green")

	        rows = [
	            ("Title", result.title or "(unknown)"),
	            ("Authors", ", ".join(result.authors) or "(unknown)"),
	            ("Model", result.model),
	            ("Mode", result.processing_mode),
	            ("Pages total", str(result.total_pages)),
	            ("Pages parsed", str(result.pages_processed)),
	            ("Equations", str(result.total_equations)),
	            ("Tables", str(result.total_tables)),
	            ("Algorithms", str(result.total_algorithms)),
	            ("Figures", str(result.total_figures)),
	            ("Tokens used", f"{result.total_tokens_used:,}"),
	            ("Processing time", f"{result.total_processing_time_s:.1f}s"),
	            ("Document hash", result.document_hash),
	        ]
	        for metric, value in rows:
	            table.add_row(metric, value)

	        console.print(table)


	# ---------------------------------------------------------------------------
	# Agent interface
	# ---------------------------------------------------------------------------

	class AgentPDFInterface:
	    """
	    Facade over AtomicPDFParser for agent pipelines.

	    Every method takes a file path and returns plain, serializable Python
	    values (dicts, lists, strings) instead of dataclass instances.

	    Example:
	        from pdf_atomic_parser import AgentPDFInterface

	        agent = AgentPDFInterface(model="opus")
	        full = agent.parse("paper.pdf")
	        eqs = agent.get_equations("paper.pdf")
	        answer = agent.ask("paper.pdf", "What is the loss function?")
	    """

	    def __init__(self, **kwargs):
	        """Forward all keyword arguments to AtomicPDFParser."""
	        self._parser = AtomicPDFParser(**kwargs)

	    @staticmethod
	    def _to_dicts(blocks) -> List[Dict]:
	        """Convert a sequence of dataclass instances into a list of dicts."""
	        return [asdict(block) for block in blocks]

	    def parse(self, pdf_path: str, page_range: Optional[Tuple[int, int]] = None) -> Dict:
	        """Parse the document (or a page range) and return the result as a dict."""
	        document = self._parser.parse(pdf_path, page_range)
	        return asdict(document)

	    def get_equations(self, pdf_path: str) -> List[Dict]:
	        """Return every extracted equation as a dict."""
	        return self._to_dicts(self._parser.extract_equations(pdf_path))

	    def get_tables(self, pdf_path: str) -> List[Dict]:
	        """Return every extracted table as a dict."""
	        return self._to_dicts(self._parser.extract_tables(pdf_path))

	    def get_algorithms(self, pdf_path: str) -> List[Dict]:
	        """Return every extracted algorithm as a dict."""
	        return self._to_dicts(self._parser.extract_algorithms(pdf_path))

	    def get_figures(self, pdf_path: str) -> List[Dict]:
	        """Return every extracted figure as a dict."""
	        return self._to_dicts(self._parser.extract_figures(pdf_path))

	    def ask(self, pdf_path: str, question: str) -> str:
	        """Answer a question about the document via the parser's query()."""
	        return self._parser.query(pdf_path, question)

	    def get_full_text(self, pdf_path: str) -> str:
	        """Return all page text, each page prefixed with a "[Page N]" marker."""
	        document = self._parser.parse(pdf_path)
	        sections = []
	        for page in document.page_results:
	            sections.append(f"[Page {page.page_number}]\n{page.raw_text}")
	        return "\n\n".join(sections)

	    def cache_stats(self) -> Dict:
	        """Return the statistics dict of the parser's cache."""
	        return self._parser.cache.stats()


	# ---------------------------------------------------------------------------
	# Batch processor
	# ---------------------------------------------------------------------------

	def batch_process(
	    input_dir: Path,
	    output_dir: Path,
	    parser: AtomicPDFParser,
	    fmt: str = "json",
	) -> None:
	    """
	    Parse every PDF under ``input_dir`` and write one output file per PDF.

	    Args:
	        input_dir: Directory searched recursively for ``*.pdf`` files.
	        output_dir: Destination directory; created if it does not exist.
	            Each output keeps the PDF's path relative to ``input_dir``, so
	            PDFs sharing a file name in different sub-directories do not
	            overwrite each other. PDFs directly inside ``input_dir`` are
	            written as ``output_dir/<stem>.<ext>``.
	        parser: Parser whose ``parse`` method is called on each PDF path.
	        fmt: ``"json"`` or ``"markdown"``; any other value produces plain
	            text (``.txt``).

	    A failure on one PDF is printed and logged; processing then continues
	    with the next file.
	    """
	    # The previous pattern "*/.pdf" only matched files literally named
	    # ".pdf" one level down, so no real PDF was ever found.
	    pdfs = sorted(input_dir.glob("**/*.pdf"))
	    if not pdfs:
	        console.print(f"[yellow]No PDF files found in {input_dir}[/yellow]")
	        return

	    output_dir.mkdir(parents=True, exist_ok=True)
	    console.print(f"[cyan]Found {len(pdfs)} PDF files to process.[/cyan]")

	    # The output format is loop-invariant: resolve extension and renderer once.
	    if fmt == "json":
	        suffix, render = ".json", OutputFormatter.to_json
	    elif fmt == "markdown":
	        suffix, render = ".md", OutputFormatter.to_markdown
	    else:
	        suffix, render = ".txt", OutputFormatter.to_text

	    for pdf_path in pdfs:
	        console.print(f"\n[bold]Processing:[/bold] {pdf_path.name}")
	        try:
	            result = parser.parse(pdf_path)
	            # Mirror the input layout below output_dir to avoid name clashes.
	            out = (output_dir / pdf_path.relative_to(input_dir)).with_suffix(suffix)
	            out.parent.mkdir(parents=True, exist_ok=True)
	            out.write_text(render(result), encoding="utf-8")
	            console.print(f" [green]Saved:[/green] {out}")
	            OutputFormatter.print_summary_table(result)
	        except Exception as exc:
	            # Batch boundary: report the failure and keep going with the rest.
	            console.print(f" [red]Error processing {pdf_path.name}: {exc}[/red]")
	            logger.exception("Batch error")


	# ---------------------------------------------------------------------------
	# Token estimator
	# ---------------------------------------------------------------------------

	def estimate_tokens(pdf_path: Path) -> None:
	    """
	    Print a rough token and cost estimate for parsing ``pdf_path``.

	    The estimate is derived purely from the page count (800 input and 400
	    output tokens per page) and priced at 15 / 75 dollars per million
	    input / output tokens. Nothing is sent to the API.
	    """
	    with PDFDocument(pdf_path) as pdf:
	        page_count = pdf.total_pages
	        megabytes = pdf.file_size_bytes / 1e6

	    # Rough estimate: ~800 tokens per page for dense academic content
	    tokens_in = page_count * 800
	    tokens_out = page_count * 400
	    tokens_all = tokens_in + tokens_out

	    # Pricing approximate (Opus: $15/Mtok in, $75/Mtok out as of 2025)
	    cost_opus = (tokens_in * 15 + tokens_out * 75) / 1_000_000

	    report = Table(title=f"Token Estimate: {pdf_path.name}", show_lines=True)
	    report.add_column("Metric", style="cyan")
	    report.add_column("Estimate", style="yellow")

	    rows = (
	        ("Total pages", str(page_count)),
	        ("File size", f"{megabytes:.2f} MB"),
	        ("Est. input tokens", f"{tokens_in:,}"),
	        ("Est. output tokens", f"{tokens_out:,}"),
	        ("Est. total tokens", f"{tokens_all:,}"),
	        ("Est. cost (Opus)", f"${cost_opus:.2f}"),
	        ("Note", "Estimate only; actual usage varies"),
	    )
	    for metric, estimate in rows:
	        report.add_row(metric, estimate)

	    console.print(report)


	# ---------------------------------------------------------------------------
	# CLI
	# ---------------------------------------------------------------------------

	def build_cli() -> argparse.ArgumentParser:
	    """
	    Build the command-line argument parser.

	    Global options (``--model``, ``--mode``, ``--chunk-size``, ``--verbose``)
	    are defined on the top-level parser; the chosen sub-command is stored in
	    ``args.command`` and is required.

	    Returns:
	        The configured ``argparse.ArgumentParser``.
	    """
	    parser = argparse.ArgumentParser(
	        prog="pdf_atomic_parser",
	        description="Atomic PDF parser powered by Claude claude-opus-4-6",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    # Plain "|" separators: the former "\|" was an invalid escape sequence
	    # and showed literal backslashes in --help.
	    parser.add_argument("--model", default="opus", help="opus | sonnet | haiku | full-model-string")
	    parser.add_argument("--mode", default="native", choices=["native", "image"], help="Parsing mode")
	    parser.add_argument("--chunk-size", type=int, default=CHUNK_SIZE_DEFAULT, help="Pages per API call")
	    parser.add_argument("--verbose", action="store_true")

	    sub = parser.add_subparsers(dest="command", required=True)

	    # parse
	    p_parse = sub.add_parser("parse", help="Parse a PDF fully")
	    p_parse.add_argument("pdf", help="Path to PDF file")
	    p_parse.add_argument("--output", "-o", help="Output file path")
	    p_parse.add_argument("--format", "-f", default="json", choices=["json", "markdown", "text"])
	    p_parse.add_argument("--pages", help="Page range e.g. 1-50")

	    # atomic (alias for parse with all content)
	    p_atomic = sub.add_parser("atomic", help="Full atomic extraction to directory")
	    p_atomic.add_argument("pdf", help="Path to PDF file")
	    p_atomic.add_argument("--output", "-o", default="./atomic_output")

	    # extract-* commands all share the same arguments: a PDF and an optional output path
	    extract_commands = (
	        ("extract-equations", "Extract LaTeX equations"),
	        ("extract-tables", "Extract tables"),
	        ("extract-algorithms", "Extract algorithms/code"),
	        ("extract-figures", "Extract figure descriptions"),
	    )
	    for command_name, command_help in extract_commands:
	        p_extract = sub.add_parser(command_name, help=command_help)
	        p_extract.add_argument("pdf")
	        p_extract.add_argument("--output", "-o")

	    # query
	    p_q = sub.add_parser("query", help="Ask a question about the PDF")
	    p_q.add_argument("pdf")
	    p_q.add_argument("question", help="Question to ask")

	    # batch
	    p_batch = sub.add_parser("batch", help="Batch process a directory of PDFs")
	    p_batch.add_argument("directory")
	    p_batch.add_argument("--output", "-o", default="./batch_output")
	    p_batch.add_argument("--format", "-f", default="json", choices=["json", "markdown", "text"])

	    # estimate
	    p_est = sub.add_parser("estimate", help="Estimate token cost before parsing")
	    p_est.add_argument("pdf")

	    # cache commands
	    sub.add_parser("cache-stats", help="Show cache statistics")
	    sub.add_parser("list-cache", help="List all cached documents")
	    p_cc = sub.add_parser("clear-cache", help="Clear cache for a document")
	    p_cc.add_argument("pdf", help="PDF path (to identify document)")

	    return parser


	def parse_page_range(s: str) -> Tuple[int, int]:
	    """
	    Convert a ``start-end`` page range string into a ``(start, end)`` tuple.

	    Whitespace around either number is ignored.

	    Args:
	        s: Range text such as ``"1-20"``.

	    Returns:
	        ``(start, end)`` as integers.

	    Raises:
	        ValueError: If ``s`` is not two dash-separated integers, or if the
	            range is not ordered and positive (``1 <= start <= end``).
	    """
	    parts = s.split("-")
	    if len(parts) != 2:
	        raise ValueError(f"Page range must be in format start-end, got: {s}")
	    try:
	        start, end = int(parts[0]), int(parts[1])
	    except ValueError:
	        # Replace int()'s generic message with one that names the bad input.
	        raise ValueError(f"Page range must contain integer page numbers, got: {s}") from None
	    # The CLI examples use 1-based page numbers; a reversed or non-positive
	    # range is rejected here rather than passed on to the parser.
	    if start < 1 or end < start:
	        raise ValueError(f"Page range must satisfy 1 <= start <= end, got: {s}")
	    return start, end


	def save_output(content: str, output_path: Optional[str], default_name: str) -> None:
	    """
	    Write ``content`` to a UTF-8 text file and print where it was saved.

	    ``output_path`` is used when it is non-empty; otherwise the file is
	    written to ``default_name``. Missing parent directories are created.
	    """
	    target = Path(output_path or default_name)
	    target.parent.mkdir(parents=True, exist_ok=True)
	    target.write_text(content, encoding="utf-8")
	    console.print(f"[green]Saved:[/green] {target}")


	# File extension used for each --format value.
	_OUTPUT_EXTENSIONS = {"json": "json", "markdown": "md", "text": "txt"}


	def _render_result(result: "DocumentResult", fmt: str) -> str:
	    """Serialise ``result`` as JSON, Markdown or (for any other ``fmt``) plain text."""
	    if fmt == "json":
	        return OutputFormatter.to_json(result)
	    if fmt == "markdown":
	        return OutputFormatter.to_markdown(result)
	    return OutputFormatter.to_text(result)


	def main() -> None:
	    """
	    Command-line entry point: parse arguments and run the chosen sub-command.

	    Commands that only inspect the local cache or the PDF file itself
	    (``cache-stats``, ``list-cache``, ``estimate``, ``clear-cache``) are
	    handled before an ``AtomicPDFParser`` is constructed. All remaining
	    commands build the parser from the global options first.
	    """
	    cli = build_cli()
	    args = cli.parse_args()
	    cache = ParseCache(Path.home() / ".cache" / "pdf_atomic_parser")

	    if args.command == "cache-stats":
	        stats = cache.stats()
	        table = Table(title="Cache Statistics", show_lines=True)
	        table.add_column("Key", style="cyan")
	        table.add_column("Value", style="green")
	        for k, v in stats.items():
	            table.add_row(k.replace("_", " ").title(), str(v))
	        console.print(table)
	        return

	    if args.command == "list-cache":
	        docs = cache.list_documents()
	        if not docs:
	            console.print("[yellow]Cache is empty.[/yellow]")
	            return
	        # Imported once here instead of on every loop iteration.
	        import datetime

	        table = Table(title="Cached Documents", show_lines=True)
	        table.add_column("Hash", style="cyan")
	        table.add_column("Cached Pages", style="green")
	        table.add_column("First Seen", style="dim")
	        for d in docs:
	            ts = datetime.datetime.fromtimestamp(d["first_seen"]).strftime("%Y-%m-%d %H:%M")
	            table.add_row(d["hash"], str(d["cached_pages"]), ts)
	        console.print(table)
	        return

	    if args.command == "estimate":
	        estimate_tokens(Path(args.pdf))
	        return

	    if args.command == "clear-cache":
	        # Needs only the cache, so it runs before the parser is constructed.
	        doc_hash = cache.file_hash(Path(args.pdf))
	        n = cache.clear_document(doc_hash)
	        console.print(f"[green]Cleared {n} cached pages for {Path(args.pdf).name}[/green]")
	        return

	    parser = AtomicPDFParser(
	        model=args.model,
	        mode=args.mode,
	        chunk_size=args.chunk_size,
	        verbose=args.verbose,
	    )

	    # Each extract-* command dumps one per-page list attribute as JSON; the
	    # attribute name doubles as the file-name suffix and the message noun.
	    extract_targets = {
	        "extract-equations": "equations",
	        "extract-tables": "tables",
	        "extract-algorithms": "algorithms",
	        "extract-figures": "figures",
	    }

	    if args.command in ("parse", "atomic"):
	        page_range = None
	        # Only the "parse" sub-command defines --pages.
	        pages_arg = getattr(args, "pages", None)
	        if pages_arg:
	            try:
	                page_range = parse_page_range(pages_arg)
	            except ValueError as exc:
	                # Report a malformed range as a usage error, not a traceback.
	                cli.error(str(exc))

	        result = parser.parse(args.pdf, page_range)
	        OutputFormatter.print_summary_table(result)
	        stem = Path(args.pdf).stem

	        if args.command == "atomic":
	            # "atomic" writes every supported format into the output directory.
	            out_dir = Path(args.output)
	            out_dir.mkdir(parents=True, exist_ok=True)
	            for fmt, ext in _OUTPUT_EXTENSIONS.items():
	                target = out_dir / f"{stem}.{ext}"
	                target.write_text(_render_result(result, fmt), encoding="utf-8")
	                console.print(f"[green]Saved {fmt}:[/green] {target}")
	        else:
	            fmt = args.format
	            # Same extensions as "atomic"/"batch": text output defaults to
	            # .txt (it previously defaulted to .text).
	            ext = _OUTPUT_EXTENSIONS.get(fmt, "txt")
	            save_output(
	                _render_result(result, fmt),
	                getattr(args, "output", None),
	                f"{stem}_parsed.{ext}",
	            )

	    elif args.command in extract_targets:
	        kind = extract_targets[args.command]
	        result = parser.parse(args.pdf)
	        items = [asdict(item) for page in result.page_results for item in getattr(page, kind)]
	        content = json.dumps(items, indent=2, ensure_ascii=False)
	        save_output(content, args.output, f"{Path(args.pdf).stem}_{kind}.json")
	        console.print(f"[cyan]{len(items)} {kind} extracted.[/cyan]")

	    elif args.command == "query":
	        answer = parser.query(args.pdf, args.question)
	        console.print(f"\n[bold cyan]Answer:[/bold cyan]\n{answer}")

	    elif args.command == "batch":
	        batch_process(
	            Path(args.directory),
	            Path(args.output),
	            parser,
	            getattr(args, "format", "json"),
	        )


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	main()