algorembrant committed on
Commit
0ee11bd
·
verified ·
1 Parent(s): 801646b

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitattributes +37 -34
  2. .gitignore +97 -0
  3. README.md +300 -0
  4. model_card.yml +88 -0
  5. pdf_atomic_parser.py +1405 -0
  6. requirements.txt +4 -0
.gitattributes CHANGED
@@ -1,35 +1,38 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Handle line endings automatically for all files tracked by Git
2
+ * text=auto eol=lf
3
+
4
+ # Explicitly declare Python as text
5
+ *.py text eol=lf linguist-language=Python
6
+ *.txt text eol=lf
7
+ *.md text eol=lf
8
+ *.yml text eol=lf
9
+ *.yaml text eol=lf
10
+ *.json text eol=lf
11
+ *.sh text eol=lf
12
+
13
+ # Binary files - do not attempt to process line endings
14
+ *.pdf binary
15
+ *.png binary
16
+ *.jpg binary
17
+ *.jpeg binary
18
+ *.gif binary
19
+ *.ico binary
20
+ *.db binary
21
+ *.zip binary
22
+ *.tar binary
23
+ *.gz binary
24
+ *.whl binary
25
+
26
+ # Hugging Face LFS tracked files
27
+ *.bin filter=lfs diff=lfs merge=lfs -text
28
+ *.pt filter=lfs diff=lfs merge=lfs -text
29
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
30
  *.safetensors filter=lfs diff=lfs merge=lfs -text
31
+ *.h5 filter=lfs diff=lfs merge=lfs -text
32
+ *.npz filter=lfs diff=lfs merge=lfs -text
33
+ *.npy filter=lfs diff=lfs merge=lfs -text
34
+ *.parquet filter=lfs diff=lfs merge=lfs -text
35
+ *.arrow filter=lfs diff=lfs merge=lfs -text
36
+
37
+ # Statistics
38
+ *.ipynb linguist-detectable=true
 
 
.gitignore ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+ pip-wheel-metadata/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+
28
+ # Virtual environments
29
+ .venv/
30
+ venv/
31
+ ENV/
32
+ env/
33
+ .env
34
+
35
+ # Environment / secrets
36
+ .env
37
+ *.env
38
+ .env.*
39
+ !.env.example
40
+
41
+ # IDE
42
+ .vscode/
43
+ .idea/
44
+ *.sublime-project
45
+ *.sublime-workspace
46
+ .DS_Store
47
+ Thumbs.db
48
+
49
+ # Testing
50
+ .pytest_cache/
51
+ .coverage
52
+ htmlcov/
53
+ .tox/
54
+ .nox/
55
+ *.cover
56
+ *.py,cover
57
+ .hypothesis/
58
+ coverage.xml
59
+ nosetests.xml
60
+ pytest.xml
61
+
62
+ # Distribution
63
+ *.tar.gz
64
+ *.whl
65
+
66
+ # PDF parser specific
67
+ .pdf_parser_cache.db
68
+ atomic_output/
69
+ batch_output/
70
+ results/
71
+ *.parsed.json
72
+ *.parsed.md
73
+ *.parsed.txt
74
+
75
+ # Logs
76
+ *.log
77
+ logs/
78
+
79
+ # Jupyter
80
+ .ipynb_checkpoints/
81
+ *.ipynb
82
+
83
+ # macOS
84
+ .DS_Store
85
+ .AppleDouble
86
+ .LSOverride
87
+
88
+ # Windows
89
+ Thumbs.db
90
+ ehthumbs.db
91
+ Desktop.ini
92
+
93
+ # Type checking
94
+ .mypy_cache/
95
+ .dmypy.json
96
+ dmypy.json
97
+ .pytype/
README.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF Atomic Parser
2
+
3
+ ![Python](https://img.shields.io/badge/Python-3.10%2B-blue?logo=python&logoColor=white)
4
+ ![License](https://img.shields.io/badge/License-MIT-green)
5
+ ![Model](https://img.shields.io/badge/Model-claude--opus--4--6-purple)
6
+ ![Status](https://img.shields.io/badge/Status-Stable-brightgreen)
7
+ ![Hugging Face](https://img.shields.io/badge/Hugging%20Face-Spaces-yellow?logo=huggingface)
8
+ ![Author](https://img.shields.io/badge/Author-algorembrant-orange)
9
+
10
+ Atomically parse and understand complex PDF documents using **claude-opus-4-6** (Anthropic).
11
+ Handles equations, graphs, algorithms, unique drawings, multi-column layouts, scanned pages,
12
+ and 100+ page documents without hallucination.
13
+
14
+ Designed to be dropped into local agent pipelines as a callable module.
15
+
16
+ ---
17
+
18
+ ## What Makes This Work
19
+
20
+ Claude processes PDFs natively through Anthropic's document API. Each page is sent as a
21
+ base64-encoded PDF chunk (or rendered at 300 DPI in image mode) alongside a structured
22
+ JSON extraction prompt. The model simultaneously sees:
23
+
24
+ - The rasterized visual content (charts, graphs, drawings, handwriting)
25
+ - The underlying text layer (searchable text, equations, captions)
26
+
27
+ This dual perception eliminates the need for separate OCR, layout parsers, or equation
28
+ recognizers. The model returns fully structured JSON containing LaTeX equations, Markdown
29
+ tables, verbatim algorithm code, and semantic figure descriptions per page.
30
+
31
+ ---
32
+
33
+ ## Features
34
+
35
+ | Feature | Description |
36
+ |---|---|
37
+ | Native PDF API | Sends PDF bytes directly; Claude sees both text and visuals |
38
+ | Image mode | Renders pages at 300 DPI via PyMuPDF for maximum fidelity |
39
+ | LaTeX equations | Every equation extracted as proper LaTeX |
40
+ | Table extraction | Tables as Markdown and list-of-dicts JSON |
41
+ | Algorithm extraction | Pseudocode and code blocks verbatim with language detection |
42
+ | Figure description | Semantic descriptions of charts, plots, diagrams, drawings |
43
+ | SQLite caching | Pages are cached; re-runs skip already-parsed pages |
44
+ | Chunked processing | Handles 100+ page documents by splitting into chunks |
45
+ | Multiple output formats | JSON, Markdown, plain text |
46
+ | Agent interface | `AgentPDFInterface` class for programmatic use |
47
+ | Batch processing | Process entire directories of PDFs |
48
+
49
+ ---
50
+
51
+ ## Requirements
52
+
53
+ - Python 3.10 or higher
54
+ - An Anthropic API key with access to `claude-opus-4-6`
55
+ - No GPU required; all inference runs through the Anthropic API
56
+
57
+ ### External System Dependencies
58
+
59
+ PyMuPDF (installed via pip) requires no external system libraries on most platforms.
60
+ On some Linux systems you may need:
61
+
62
+ ```bash
63
+ sudo apt-get install -y libmupdf-dev
64
+ ```
65
+
66
+ On macOS:
67
+
68
+ ```bash
69
+ brew install mupdf
70
+ ```
71
+
72
+ On Windows: PyMuPDF ships with pre-built wheels on PyPI; no additional steps needed.
73
+
74
+ ---
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ git clone https://github.com/algorembrant/pdf-atomic-parser.git
80
+ cd pdf-atomic-parser
81
+
82
+ python -m venv .venv
83
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
84
+
85
+ pip install -r requirements.txt
86
+ ```
87
+
88
+ Set your API key:
89
+
90
+ ```bash
91
+ export ANTHROPIC_API_KEY="sk-ant-..." # Linux / macOS
92
+ set ANTHROPIC_API_KEY=sk-ant-... # Windows CMD
93
+ $env:ANTHROPIC_API_KEY="sk-ant-..." # Windows PowerShell
94
+ ```
95
+
96
+ ---
97
+
98
+ ## Quick Start
99
+
100
+ ### Parse a PDF
101
+
102
+ ```bash
103
+ python pdf_atomic_parser.py parse document.pdf
104
+ ```
105
+
106
+ Outputs `document_parsed.json` in the current directory.
107
+
108
+ ### Full Atomic Extraction (JSON + Markdown + Text)
109
+
110
+ ```bash
111
+ python pdf_atomic_parser.py atomic document.pdf --output ./results/
112
+ ```
113
+
114
+ ### Ask a Question
115
+
116
+ ```bash
117
+ python pdf_atomic_parser.py query document.pdf "What is the main loss function?"
118
+ ```
119
+
120
+ ### Extract Only Equations
121
+
122
+ ```bash
123
+ python pdf_atomic_parser.py extract-equations document.pdf
124
+ ```
125
+
126
+ ### Use in an Agent Pipeline
127
+
128
+ ```python
129
+ from pdf_atomic_parser import AgentPDFInterface
130
+
131
+ agent = AgentPDFInterface(model="opus")
132
+
133
+ # Full structured parse
134
+ result = agent.parse("paper.pdf")
135
+
136
+ # Just equations as list of dicts
137
+ equations = agent.get_equations("paper.pdf")
138
+ for eq in equations:
139
+ print(f"Page {eq['page']}: {eq['latex']}")
140
+
141
+ # Just tables
142
+ tables = agent.get_tables("paper.pdf")
143
+
144
+ # Semantic query
145
+ answer = agent.ask("paper.pdf", "What datasets were used for evaluation?")
146
+ print(answer)
147
+ ```
148
+
149
+ ---
150
+
151
+ ## Usage Reference
152
+
153
+ ### Command Overview
154
+
155
+ | Command | Purpose |
156
+ |---|---|
157
+ | `parse <pdf>` | Parse entire PDF to JSON/Markdown/text |
158
+ | `atomic <pdf>` | Full extraction to output directory (all formats) |
159
+ | `extract-equations <pdf>` | Extract LaTeX equations only |
160
+ | `extract-tables <pdf>` | Extract tables only |
161
+ | `extract-algorithms <pdf>` | Extract algorithms and code blocks only |
162
+ | `extract-figures <pdf>` | Extract figure descriptions only |
163
+ | `query <pdf> "<question>"` | Semantic question-answering over document |
164
+ | `batch <dir>` | Batch process all PDFs in a directory |
165
+ | `estimate <pdf>` | Estimate token count and cost before parsing |
166
+ | `cache-stats` | Show SQLite cache statistics |
167
+ | `list-cache` | List all cached documents |
168
+ | `clear-cache <pdf>` | Clear cached pages for a document |
169
+
170
+ ### Global Options
171
+
172
+ | Option | Default | Description |
173
+ |---|---|---|
174
+ | `--model` | `opus` | `opus`, `sonnet`, `haiku`, or full model string |
175
+ | `--mode` | `native` | `native` (PDF bytes) or `image` (300 DPI PNG per page) |
176
+ | `--chunk-size` | `20` | Number of pages per API call |
177
+ | `--verbose` | off | Enable debug logging |
178
+
179
+ ### parse / atomic Options
180
+
181
+ | Option | Default | Description |
182
+ |---|---|---|
183
+ | `--output / -o` | auto | Output file or directory path |
184
+ | `--format / -f` | `json` | `json`, `markdown`, or `text` |
185
+ | `--pages` | all | Page range, e.g. `1-50` |
186
+
187
+ ---
188
+
189
+ ## Output Schema
190
+
191
+ Each parsed document returns a `DocumentResult` with:
192
+
193
+ - `title`, `authors`, `abstract`, `document_summary`
194
+ - `page_results`: list of `PageResult` per page
195
+
196
+ Each `PageResult` contains:
197
+
198
+ ```json
199
+ {
200
+ "page_number": 3,
201
+ "raw_text": "Full verbatim text...",
202
+ "summary": "This page describes...",
203
+ "section_headers": ["Introduction", "Related Work"],
204
+ "keywords": ["transformer", "attention", "BERT"],
205
+ "equations": [
206
+ {
207
+ "index": 0,
208
+ "latex": "\\mathcal{L} = -\\sum_{i} y_i \\log \\hat{y}_i",
209
+ "description": "Cross-entropy loss function",
210
+ "inline": false
211
+ }
212
+ ],
213
+ "tables": [
214
+ {
215
+ "index": 0,
216
+ "markdown": "| Model | Accuracy |\n|---|---|\n| BERT | 94.2 |",
217
+ "json_data": [{"Model": "BERT", "Accuracy": "94.2"}],
218
+ "caption": "Table 1: Benchmark results"
219
+ }
220
+ ],
221
+ "algorithms": [
222
+ {
223
+ "index": 0,
224
+ "name": "Algorithm 1: Backpropagation",
225
+ "language": "pseudocode",
226
+ "code": "for each layer l from L to 1:\n ...",
227
+ "description": "Gradient descent update rule"
228
+ }
229
+ ],
230
+ "figures": [
231
+ {
232
+ "index": 0,
233
+ "figure_type": "line_chart",
234
+ "description": "Training loss over 100 epochs...",
235
+ "data_summary": "Y-axis: loss 0-2.0, X-axis: epoch 0-100...",
236
+ "caption": "Figure 2: Training curves"
237
+ }
238
+ ]
239
+ }
240
+ ```
241
+
242
+ ---
243
+
244
+ ## Choosing a Mode
245
+
246
+ | Scenario | Recommended Mode | Reason |
247
+ |---|---|---|
248
+ | Standard digital PDF | `native` (default) | Fastest, uses both text and visual layers |
249
+ | Scanned / photographed PDF | `image` | Text layer absent; vision handles everything |
250
+ | PDF with complex math | `image` | 300 DPI render ensures equation clarity |
251
+ | Very large file (>32 MB) | `image` | Native API has 32 MB size limit per chunk |
252
+ | Cost-sensitive workflow | `native` | Fewer tokens consumed |
253
+
254
+ ---
255
+
256
+ ## Cost Estimate
257
+
258
+ Rough estimates per 100-page academic paper:
259
+
260
+ | Model | Est. Tokens | Est. Cost |
261
+ |---|---|---|
262
+ | claude-opus-4-6 | ~120,000 | ~$3.50 |
263
+ | claude-sonnet-4-6 | ~120,000 | ~$0.60 |
264
+ | claude-haiku-4-5 | ~120,000 | ~$0.10 |
265
+
266
+ Use `python pdf_atomic_parser.py estimate document.pdf` for a per-document estimate.
267
+
268
+ ---
269
+
270
+ ## Caching
271
+
272
+ Parsed pages are stored in `~/.cache/pdf_atomic_parser/.pdf_parser_cache.db`.
273
+ Re-running on the same document skips already-parsed pages automatically.
274
+ The cache key is `(document_SHA256, page_number, model, mode)`.
275
+
276
+ ---
277
+
278
+ ## Project Structure
279
+
280
+ ```
281
+ pdf-atomic-parser/
282
+ pdf_atomic_parser.py Main tool (single file, no splitting needed)
283
+ requirements.txt Python dependencies
284
+ README.md This file
285
+ model_card.yml Hugging Face model card
286
+ .gitignore
287
+ .gitattributes
288
+ ```
289
+
290
+ ---
291
+
292
+ ## Author
293
+
294
+ **algorembrant**
295
+
296
+ ---
297
+
298
+ ## License
299
+
300
+ MIT License. See LICENSE file.
model_card.yml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: mit
5
+ library_name: anthropic
6
+ tags:
7
+ - pdf
8
+ - document-parsing
9
+ - ocr
10
+ - multimodal
11
+ - equations
12
+ - table-extraction
13
+ - agent
14
+ - claude
15
+ - information-extraction
16
+ - scientific-documents
17
+ pipeline_tag: document-question-answering
18
+ model_name: PDF Atomic Parser
19
+ authors:
20
+ - algorembrant
21
+ sdk: other
22
+ sdk_version: "1.0.0"
23
+ app_file: pdf_atomic_parser.py
24
+ short_description: >
25
+ Atomically parse complex PDFs (equations, graphs, algorithms, tables)
26
+ using Claude claude-opus-4-6 without hallucination. Agent-ready.
27
+ ---
28
+
29
+ # PDF Atomic Parser
30
+
31
+ Powered by **claude-opus-4-6** (Anthropic).
32
+
33
+ ## Description
34
+
35
+ A single-file Python tool for extracting structured content from complex
36
+ academic and technical PDFs. Works on documents containing:
37
+
38
+ - Mathematical equations (extracted as LaTeX)
39
+ - Data tables (extracted as Markdown + JSON)
40
+ - Algorithms and pseudocode (verbatim with language detection)
41
+ - Figures, charts, graphs, and drawings (semantic descriptions)
42
+ - Multi-column layouts, footnotes, margin notes
43
+ - 100+ page documents via automatic chunking
44
+
45
+ ## Usage
46
+
47
+ ```bash
48
+ pip install anthropic PyMuPDF rich tqdm
49
+ export ANTHROPIC_API_KEY="sk-ant-..."
50
+
51
+ python pdf_atomic_parser.py parse document.pdf
52
+ python pdf_atomic_parser.py atomic document.pdf --output ./results/
53
+ python pdf_atomic_parser.py extract-equations document.pdf
54
+ python pdf_atomic_parser.py query document.pdf "What is the main theorem?"
55
+ ```
56
+
57
+ ## Agent Integration
58
+
59
+ ```python
60
+ from pdf_atomic_parser import AgentPDFInterface
61
+
62
+ agent = AgentPDFInterface(model="opus")
63
+ result = agent.parse("paper.pdf")
64
+ equations = agent.get_equations("paper.pdf")
65
+ tables = agent.get_tables("paper.pdf")
66
+ answer = agent.ask("paper.pdf", "What datasets were used?")
67
+ ```
68
+
69
+ ## Model Details
70
+
71
+ | Property | Value |
72
+ |---|---|
73
+ | Underlying model | claude-opus-4-6 (Anthropic) |
74
+ | Parsing modes | native PDF, page-as-image (300 DPI) |
75
+ | Max pages per call | 20 (configurable) |
76
+ | Cache | SQLite, keyed by SHA-256 + page + model + mode |
77
+ | Output formats | JSON, Markdown, plain text |
78
+
79
+ ## Citation
80
+
81
+ ```bibtex
82
+ @software{algorembrant2025pdfparser,
83
+ author = {algorembrant},
84
+ title = {PDF Atomic Parser},
85
+ year = {2025},
86
+ url = {https://github.com/algorembrant/pdf-atomic-parser}
87
+ }
88
+ ```
pdf_atomic_parser.py ADDED
@@ -0,0 +1,1405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ pdf_atomic_parser.py
3
+ ====================
4
+ Author : algorembrant
5
+ Version : 1.0.0
6
+ License : MIT
7
+
8
+ DESCRIPTION
9
+ -----------
10
+ Atomically parse and understand complex PDF documents using Claude claude-opus-4-6.
11
+ Handles equations, graphs, algorithms, unique drawings, tables, multi-column
12
+ layouts, and 100+ page documents without hallucination. Designed for local
13
+ agent pipelines.
14
+
15
+ CAPABILITIES
16
+ ------------
17
+ - Native PDF document API (base64) with prompt caching
18
+ - Page-as-image fallback using PyMuPDF at 300 DPI for max fidelity
19
+ - LaTeX equation extraction
20
+ - Table extraction (Markdown + JSON)
21
+ - Algorithm and pseudocode extraction
22
+ - Figure and graph semantic description
23
+ - Multi-column and complex layout handling
24
+ - Chunked processing for 100+ page documents
25
+ - SQLite-backed cache to avoid re-processing pages
26
+ - Structured JSON output per page and full document
27
+ - Agent-callable interface (AgentPDFInterface)
28
+ - Async batch processing for speed
29
+
30
+ USAGE COMMANDS
31
+ --------------
32
+ # Parse a PDF and save structured JSON
33
+ python pdf_atomic_parser.py parse document.pdf
34
+
35
+ # Parse with verbose output
36
+ python pdf_atomic_parser.py parse document.pdf --verbose
37
+
38
+ # Parse specific page range
39
+ python pdf_atomic_parser.py parse document.pdf --pages 1-20
40
+
41
+ # Extract only equations (LaTeX)
42
+ python pdf_atomic_parser.py extract-equations document.pdf
43
+
44
+ # Extract only tables (Markdown)
45
+ python pdf_atomic_parser.py extract-tables document.pdf
46
+
47
+ # Extract only algorithms/code blocks
48
+ python pdf_atomic_parser.py extract-algorithms document.pdf
49
+
50
+ # Extract figures and graph descriptions
51
+ python pdf_atomic_parser.py extract-figures document.pdf
52
+
53
+ # Full atomic extraction (all content types) to output dir
54
+ python pdf_atomic_parser.py atomic document.pdf --output ./results/
55
+
56
+ # Query a parsed PDF (semantic search over cached parse)
57
+ python pdf_atomic_parser.py query document.pdf "What is the main theorem?"
58
+
59
+ # Use faster/cheaper model (Sonnet instead of Opus)
60
+ python pdf_atomic_parser.py parse document.pdf --model sonnet
61
+
62
+ # Use page-as-image mode (higher fidelity for scanned/complex PDFs)
63
+ python pdf_atomic_parser.py parse document.pdf --mode image
64
+
65
+ # Use native PDF mode (default, faster)
66
+ python pdf_atomic_parser.py parse document.pdf --mode native
67
+
68
+ # Set chunk size for large PDFs (default 20 pages per chunk)
69
+ python pdf_atomic_parser.py parse document.pdf --chunk-size 10
70
+
71
+ # Clear cache for a document
72
+ python pdf_atomic_parser.py clear-cache document.pdf
73
+
74
+ # Show cache stats
75
+ python pdf_atomic_parser.py cache-stats
76
+
77
+ # List all cached documents
78
+ python pdf_atomic_parser.py list-cache
79
+
80
+ # Batch process a directory of PDFs
81
+ python pdf_atomic_parser.py batch ./pdf_folder/ --output ./results/
82
+
83
+ # Export parse results as Markdown report
84
+ python pdf_atomic_parser.py parse document.pdf --format markdown
85
+
86
+ # Export as plain text
87
+ python pdf_atomic_parser.py parse document.pdf --format text
88
+
89
+ # Show token usage estimate before parsing
90
+ python pdf_atomic_parser.py estimate document.pdf
91
+
92
+ # Agent interface example (programmatic)
93
+ # from pdf_atomic_parser import AgentPDFInterface
94
+ # agent = AgentPDFInterface()
95
+ # result = agent.parse("document.pdf")
96
+ # equations = agent.get_equations("document.pdf")
97
+ """
98
+
99
from __future__ import annotations

import argparse
import asyncio
import base64
import hashlib
import json
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple

import anthropic
import fitz  # PyMuPDF
from rich.console import Console
from rich.logging import RichHandler
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
)
from rich.table import Table
from tqdm import tqdm
132
+
133
+
134
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Model aliases resolved from the --model CLI flag ("opus" / "sonnet" / "haiku").
DEFAULT_MODEL_OPUS = "claude-opus-4-6"
DEFAULT_MODEL_SONNET = "claude-sonnet-4-6"
DEFAULT_MODEL_HAIKU = "claude-haiku-4-5-20251001"

MAX_TOKENS_OUTPUT = 8192  # upper bound on tokens generated per API response
CHUNK_SIZE_DEFAULT = 20  # pages per API call
IMAGE_DPI = 300  # render DPI for page-as-image mode
MAX_PDF_SIZE_BYTES = 32 * 1024 * 1024  # 32 MB native API limit
MAX_PDF_PAGES_NATIVE = 100  # native API page cap per request
CACHE_DB_NAME = ".pdf_parser_cache.db"  # SQLite file name inside the cache dir
LOG_FORMAT = "%(message)s"  # message only; RichHandler renders level/time itself

# Single shared console so logging and progress bars render consistently.
console = Console()

logging.basicConfig(
    level=logging.WARNING,
    format=LOG_FORMAT,
    handlers=[RichHandler(console=console, rich_tracebacks=True, show_path=False)],
)
logger = logging.getLogger("pdf_atomic_parser")
158
+
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # Data structures
162
+ # ---------------------------------------------------------------------------
163
+
164
@dataclass
class EquationBlock:
    """One equation extracted from a PDF page."""

    page: int  # page number the equation was found on
    index: int  # ordinal of the equation within that page
    latex: str  # equation source in LaTeX form
    description: str  # short natural-language explanation of the equation
    inline: bool = False  # True for inline math, False for a display equation
171
+
172
+
173
@dataclass
class TableBlock:
    """One table extracted from a PDF page."""

    page: int  # page number the table was found on
    index: int  # ordinal of the table within that page
    markdown: str  # table rendered as Markdown
    json_data: List[Dict]  # table rows as a list of column->value dicts
    caption: str = ""  # caption text, if one was present
180
+
181
+
182
@dataclass
class AlgorithmBlock:
    """One algorithm or code listing extracted from a PDF page."""

    page: int  # page number the listing was found on
    index: int  # ordinal of the listing within that page
    name: str  # listing title, e.g. "Algorithm 1: ..."
    language: str  # detected language, e.g. "pseudocode" or "python"
    code: str  # verbatim code / pseudocode text
    description: str  # short explanation of what the listing does
190
+
191
+
192
@dataclass
class FigureBlock:
    """Semantic description of one figure found on a PDF page."""

    page: int  # page number the figure was found on
    index: int  # ordinal of the figure within that page
    figure_type: str  # chart | diagram | drawing | photograph | plot
    description: str  # what the figure depicts
    data_summary: str  # axes / series / value ranges, when extractable
    caption: str = ""  # caption text, if one was present
200
+
201
+
202
@dataclass
class PageResult:
    """All structured content extracted from a single PDF page."""

    page_number: int
    raw_text: str  # full verbatim text of the page
    summary: str  # short synopsis of the page content
    equations: List[EquationBlock] = field(default_factory=list)
    tables: List[TableBlock] = field(default_factory=list)
    algorithms: List[AlgorithmBlock] = field(default_factory=list)
    figures: List[FigureBlock] = field(default_factory=list)
    section_headers: List[str] = field(default_factory=list)
    references: List[str] = field(default_factory=list)
    keywords: List[str] = field(default_factory=list)
    layout_notes: str = ""  # free-form notes on multi-column / unusual layout
    processing_mode: str = "native"  # "native" (PDF bytes) or "image" mode
    tokens_used: int = 0  # API tokens consumed for this page
    processing_time_s: float = 0.0  # wall-clock seconds spent on this page
218
+
219
+
220
@dataclass
class DocumentResult:
    """Aggregated parse result for an entire PDF document."""

    document_path: str
    document_hash: str  # content hash identifying the file (see ParseCache.file_hash)
    total_pages: int
    pages_processed: int
    model: str  # model identifier used for parsing
    processing_mode: str  # "native" or "image"
    title: str
    authors: List[str]
    abstract: str
    document_summary: str
    page_results: List[PageResult] = field(default_factory=list)
    # Aggregate counts / costs across all processed pages.
    total_equations: int = 0
    total_tables: int = 0
    total_algorithms: int = 0
    total_figures: int = 0
    total_tokens_used: int = 0
    total_processing_time_s: float = 0.0
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # Cache layer
243
+ # ---------------------------------------------------------------------------
244
+
245
class ParseCache:
    """SQLite-backed cache for parsed page results.

    Pages are keyed by (doc_hash, page_num, model, mode), so re-parsing the
    same document with a different model or mode is cached independently.

    Each operation opens its own short-lived connection and always closes it:
    the sqlite3 connection context manager only manages the *transaction*
    (commit/rollback), not the connection's lifetime, so the original
    ``with self._connect() as conn:`` pattern leaked one open connection
    (plus WAL side files) per call. ``contextlib.closing`` fixes that while
    keeping the transactional behavior.
    """

    def __init__(self, cache_dir: Path):
        """Create/open the cache database inside *cache_dir*."""
        cache_dir.mkdir(parents=True, exist_ok=True)
        self.db_path = cache_dir / CACHE_DB_NAME
        self._init_db()

    def _init_db(self) -> None:
        """Create the cache tables if they do not exist yet (idempotent)."""
        with closing(self._connect()) as conn, conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS page_cache (
                    doc_hash TEXT NOT NULL,
                    page_num INTEGER NOT NULL,
                    model TEXT NOT NULL,
                    mode TEXT NOT NULL,
                    result_json TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    PRIMARY KEY (doc_hash, page_num, model, mode)
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS doc_meta (
                    doc_hash TEXT PRIMARY KEY,
                    doc_path TEXT NOT NULL,
                    total_pages INTEGER NOT NULL,
                    created_at REAL NOT NULL
                )
            """)

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection; WAL mode allows concurrent readers.

        Callers are responsible for closing the returned connection
        (use ``with closing(self._connect()) as conn, conn:``).
        """
        conn = sqlite3.connect(self.db_path, timeout=30)
        conn.execute("PRAGMA journal_mode=WAL")
        return conn

    @staticmethod
    def file_hash(path: Path) -> str:
        """Return a truncated (16 hex chars) SHA-256 of the file contents.

        Streams the file in 64 KiB chunks so large PDFs are never fully
        loaded into memory.
        """
        h = hashlib.sha256()
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(65536), b""):
                h.update(chunk)
        return h.hexdigest()[:16]

    def get_page(self, doc_hash: str, page_num: int, model: str, mode: str) -> Optional[PageResult]:
        """Return the cached PageResult for one page, or None on a miss."""
        with closing(self._connect()) as conn, conn:
            row = conn.execute(
                "SELECT result_json FROM page_cache WHERE doc_hash=? AND page_num=? AND model=? AND mode=?",
                (doc_hash, page_num, model, mode),
            ).fetchone()
        if row:
            return self._deserialize_page(json.loads(row[0]))
        return None

    def set_page(self, doc_hash: str, result: PageResult, model: str, mode: str) -> None:
        """Insert or overwrite the cached result for one page."""
        with closing(self._connect()) as conn, conn:
            conn.execute(
                "INSERT OR REPLACE INTO page_cache VALUES (?,?,?,?,?,?)",
                (doc_hash, result.page_number, model, mode,
                 json.dumps(self._serialize_page(result)), time.time()),
            )

    def clear_document(self, doc_hash: str) -> int:
        """Delete all cached pages and metadata for a document.

        Returns the number of page rows removed.
        """
        with closing(self._connect()) as conn, conn:
            cur = conn.execute("DELETE FROM page_cache WHERE doc_hash=?", (doc_hash,))
            conn.execute("DELETE FROM doc_meta WHERE doc_hash=?", (doc_hash,))
            return cur.rowcount

    def stats(self) -> Dict[str, Any]:
        """Return aggregate cache statistics (pages, documents, on-disk size)."""
        with closing(self._connect()) as conn, conn:
            total = conn.execute("SELECT COUNT(*) FROM page_cache").fetchone()[0]
            docs = conn.execute("SELECT COUNT(DISTINCT doc_hash) FROM page_cache").fetchone()[0]
        size = self.db_path.stat().st_size if self.db_path.exists() else 0
        return {"total_cached_pages": total, "unique_documents": docs, "cache_size_mb": round(size / 1e6, 2)}

    def list_documents(self) -> List[Dict]:
        """List cached documents with page counts and first-seen timestamps."""
        with closing(self._connect()) as conn, conn:
            rows = conn.execute("""
                SELECT doc_hash, COUNT(*) as pages, MIN(created_at) as first_seen
                FROM page_cache GROUP BY doc_hash
            """).fetchall()
        return [{"hash": r[0], "cached_pages": r[1], "first_seen": r[2]} for r in rows]

    # -- serialization helpers -----------------------------------------------

    @staticmethod
    def _serialize_page(p: PageResult) -> Dict:
        """Convert a PageResult (including nested blocks) to a JSON-safe dict."""
        return asdict(p)

    @staticmethod
    def _deserialize_page(d: Dict) -> PageResult:
        """Rebuild a PageResult from its cached dict form.

        asdict() flattened the nested dataclasses to plain dicts, so each
        block list is reconstructed into its dataclass type first.
        """
        d["equations"] = [EquationBlock(**e) for e in d.get("equations", [])]
        d["tables"] = [TableBlock(**t) for t in d.get("tables", [])]
        d["algorithms"] = [AlgorithmBlock(**a) for a in d.get("algorithms", [])]
        d["figures"] = [FigureBlock(**f) for f in d.get("figures", [])]
        return PageResult(**d)
341
+
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # PDF utilities
345
+ # ---------------------------------------------------------------------------
346
+
347
class PDFDocument:
    """Thin wrapper around fitz.Document with chunking helpers."""

    def __init__(self, path: Path):
        self.path = path
        self._doc = fitz.open(str(path))
        self.total_pages = len(self._doc)

    @property
    def file_size_bytes(self) -> int:
        """Size of the underlying PDF file on disk, in bytes."""
        return self.path.stat().st_size

    def get_chunk_ranges(self, chunk_size: int) -> List[Tuple[int, int]]:
        """Return list of (start_page_0indexed, end_page_exclusive) tuples."""
        return [
            (start, min(start + chunk_size, self.total_pages))
            for start in range(0, self.total_pages, chunk_size)
        ]

    def get_chunk_as_pdf_bytes(self, start: int, end: int) -> bytes:
        """Extract pages [start, end) into a new in-memory PDF."""
        excerpt = fitz.open()
        # fitz uses inclusive to_page, hence end - 1.
        excerpt.insert_pdf(self._doc, from_page=start, to_page=end - 1)
        return excerpt.write()

    def get_page_as_png_bytes(self, page_idx: int, dpi: int = IMAGE_DPI) -> bytes:
        """Render a single page to PNG bytes at given DPI."""
        # 72 is the PDF point-per-inch baseline; the matrix scales to `dpi`.
        zoom = fitz.Matrix(dpi / 72, dpi / 72)
        pixmap = self._doc[page_idx].get_pixmap(matrix=zoom, alpha=False)
        return pixmap.tobytes("png")

    def close(self) -> None:
        """Release the underlying fitz document."""
        self._doc.close()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()
388
+
389
+
390
+ # ---------------------------------------------------------------------------
391
+ # Extraction prompts
392
+ # ---------------------------------------------------------------------------
393
+
394
# System prompt sent with every extraction call: fixes the extraction contract
# (complete, faithful, LaTeX-precise, JSON-only output).
SYSTEM_PROMPT = """You are an expert scientific document analyst specializing in atomically
parsing complex academic and technical PDFs. Your extractions must be:
- Complete: capture every equation, table, figure, and algorithm
- Faithful: never invent or hallucinate content
- Precise: reproduce equations in proper LaTeX
- Structured: respond only with valid JSON matching the schema provided

Do NOT add prose outside the JSON response. If a field has no content, use an
empty list [] or empty string "" rather than null."""

# Per-page extraction prompt: defines the exact JSON schema the model must
# return for each page (text, equations, tables, algorithms, figures, refs).
PAGE_EXTRACTION_PROMPT = """\
Atomically parse the provided PDF page(s) and return a JSON object that matches
this schema exactly:

{
"raw_text": "<full verbatim text extracted from page, preserving paragraphs>",
"summary": "<2-4 sentence factual summary of this page>",
"section_headers": ["<header string>", ...],
"keywords": ["<important technical term>", ...],
"layout_notes": "<describe columns, special layouts, footnotes, margin notes>",
"equations": [
{
"index": <int starting at 0>,
"latex": "<complete LaTeX representation>",
"description": "<what this equation represents>",
"inline": <true if inline, false if display/block>
}
],
"tables": [
{
"index": <int>,
"markdown": "<GitHub-flavored Markdown table>",
"json_data": [{"col1": "val", ...}, ...],
"caption": "<table caption or empty string>"
}
],
"algorithms": [
{
"index": <int>,
"name": "<algorithm name or Algorithm N>",
"language": "<pseudocode | python | cpp | generic | etc.>",
"code": "<verbatim algorithm text, preserve indentation>",
"description": "<what this algorithm does>"
}
],
"figures": [
{
"index": <int>,
"figure_type": "<chart | bar_chart | line_chart | scatter_plot | histogram | diagram | flowchart | neural_network | tree | graph | drawing | photograph | heatmap | 3d_plot | other>",
"description": "<detailed semantic description of the visual>",
"data_summary": "<describe axes, units, trend, key values if quantitative>",
"caption": "<figure caption or empty string>"
}
],
"references": ["<any in-text citation or bibliography entry on this page>"]
}

Rules:
1. Every equation MUST have LaTeX. Use \\frac, \\sum, \\int, \\mathbf etc. for proper notation.
2. Tables must be fully reproduced in both Markdown and as list-of-dicts.
3. Algorithms must preserve all steps, loops, conditions verbatim.
4. Figures: describe them as if for a blind reader — quantitative values, trends, colors, labels.
5. raw_text must include ALL text visible on the page, including headers, footers, captions.
6. Do NOT summarize or truncate any content.
"""

# Follow-up prompt used once per document to pull title/authors/abstract/summary.
DOCUMENT_META_PROMPT = """\
Based on the document pages you have seen, extract high-level metadata as JSON:

{
"title": "<document title>",
"authors": ["<author name>", ...],
"abstract": "<full abstract text or empty string if none>",
"document_summary": "<comprehensive 5-8 sentence summary of the entire document>"
}

Respond with valid JSON only.
"""
472
+
473
+
474
+ # ---------------------------------------------------------------------------
475
+ # Core parser
476
+ # ---------------------------------------------------------------------------
477
+
478
class AtomicPDFParser:
    """
    Core parser that sends PDF chunks or page images to the Claude API
    and extracts structured content atomically.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = DEFAULT_MODEL_OPUS,
        mode: str = "native",  # "native" | "image"
        chunk_size: int = CHUNK_SIZE_DEFAULT,
        cache_dir: Optional[Path] = None,
        verbose: bool = False,
        max_workers: int = 4,
    ):
        # Explicit key wins; otherwise fall back to the standard env variable.
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        # Short aliases ("opus"/"sonnet"/"haiku") resolve to full model ids.
        self.model = self._resolve_model(model)
        self.mode = mode
        self.chunk_size = chunk_size
        self.verbose = verbose
        # NOTE(review): max_workers is stored but not referenced in this chunk —
        # presumably intended for parallel chunk processing; confirm before relying on it.
        self.max_workers = max_workers

        if not self.api_key:
            raise ValueError(
                "ANTHROPIC_API_KEY environment variable not set. "
                "Export it or pass api_key= to AtomicPDFParser."
            )

        self.client = anthropic.Anthropic(api_key=self.api_key)

        # Page-level results are cached on disk, keyed by (doc hash, page, model, mode).
        cache_path = cache_dir or Path.home() / ".cache" / "pdf_atomic_parser"
        self.cache = ParseCache(cache_path)

        if verbose:
            logger.setLevel(logging.DEBUG)
514
+
515
+ @staticmethod
516
+ def _resolve_model(alias: str) -> str:
517
+ mapping = {
518
+ "opus": DEFAULT_MODEL_OPUS,
519
+ "sonnet": DEFAULT_MODEL_SONNET,
520
+ "haiku": DEFAULT_MODEL_HAIKU,
521
+ }
522
+ return mapping.get(alias.lower(), alias)
523
+
524
+ # ------------------------------------------------------------------
525
+ # Public API
526
+ # ------------------------------------------------------------------
527
+
528
+ def parse(
529
+ self,
530
+ pdf_path: str | Path,
531
+ page_range: Optional[Tuple[int, int]] = None,
532
+ ) -> DocumentResult:
533
+ """
534
+ Parse the entire document (or a page range) atomically.
535
+
536
+ Parameters
537
+ ----------
538
+ pdf_path : Path to the PDF file.
539
+ page_range : Optional (start, end) 1-indexed inclusive page numbers.
540
+
541
+ Returns
542
+ -------
543
+ DocumentResult with full structured extraction.
544
+ """
545
+ path = Path(pdf_path).resolve()
546
+ if not path.exists():
547
+ raise FileNotFoundError(f"PDF not found: {path}")
548
+
549
+ doc_hash = self.cache.file_hash(path)
550
+ t_start = time.time()
551
+
552
+ with PDFDocument(path) as pdf:
553
+ total = pdf.total_pages
554
+ if page_range:
555
+ p_start = max(0, page_range[0] - 1)
556
+ p_end = min(total, page_range[1])
557
+ else:
558
+ p_start, p_end = 0, total
559
+
560
+ chunks = []
561
+ for s in range(p_start, p_end, self.chunk_size):
562
+ e = min(s + self.chunk_size, p_end)
563
+ chunks.append((s, e))
564
+
565
+ page_results: List[PageResult] = []
566
+
567
+ with Progress(
568
+ SpinnerColumn(),
569
+ TextColumn("[bold cyan]{task.description}"),
570
+ BarColumn(),
571
+ MofNCompleteColumn(),
572
+ TaskProgressColumn(),
573
+ TimeElapsedColumn(),
574
+ TimeRemainingColumn(),
575
+ console=console,
576
+ transient=False,
577
+ ) as progress:
578
+ task = progress.add_task(
579
+ f"[cyan]Parsing {path.name}", total=len(chunks)
580
+ )
581
+
582
+ for chunk_start, chunk_end in chunks:
583
+ chunk_pages = self._parse_chunk(
584
+ pdf, doc_hash, chunk_start, chunk_end
585
+ )
586
+ page_results.extend(chunk_pages)
587
+ progress.advance(task)
588
+
589
+ # Build document-level metadata
590
+ meta = self._extract_document_meta(page_results)
591
+
592
+ doc_result = DocumentResult(
593
+ document_path = str(path),
594
+ document_hash = doc_hash,
595
+ total_pages = total,
596
+ pages_processed = len(page_results),
597
+ model = self.model,
598
+ processing_mode = self.mode,
599
+ title = meta.get("title", ""),
600
+ authors = meta.get("authors", []),
601
+ abstract = meta.get("abstract", ""),
602
+ document_summary = meta.get("document_summary", ""),
603
+ page_results = page_results,
604
+ total_equations = sum(len(p.equations) for p in page_results),
605
+ total_tables = sum(len(p.tables) for p in page_results),
606
+ total_algorithms = sum(len(p.algorithms) for p in page_results),
607
+ total_figures = sum(len(p.figures) for p in page_results),
608
+ total_tokens_used = sum(p.tokens_used for p in page_results),
609
+ total_processing_time_s = time.time() - t_start,
610
+ )
611
+ return doc_result
612
+
613
+ def extract_equations(self, pdf_path: str | Path) -> List[EquationBlock]:
614
+ result = self.parse(pdf_path)
615
+ return [eq for p in result.page_results for eq in p.equations]
616
+
617
+ def extract_tables(self, pdf_path: str | Path) -> List[TableBlock]:
618
+ result = self.parse(pdf_path)
619
+ return [tb for p in result.page_results for tb in p.tables]
620
+
621
+ def extract_algorithms(self, pdf_path: str | Path) -> List[AlgorithmBlock]:
622
+ result = self.parse(pdf_path)
623
+ return [al for p in result.page_results for al in p.algorithms]
624
+
625
+ def extract_figures(self, pdf_path: str | Path) -> List[FigureBlock]:
626
+ result = self.parse(pdf_path)
627
+ return [fg for p in result.page_results for fg in p.figures]
628
+
629
+ def query(self, pdf_path: str | Path, question: str) -> str:
630
+ """
631
+ Semantic query over cached parse results. Re-parses if not cached.
632
+ """
633
+ result = self.parse(pdf_path)
634
+ full_text = "\n\n".join(
635
+ f"[Page {p.page_number}]\n{p.raw_text}" for p in result.page_results
636
+ )
637
+ messages = [
638
+ {
639
+ "role": "user",
640
+ "content": (
641
+ f"Based on the following document content, answer this question "
642
+ f"precisely and cite page numbers where relevant.\n\n"
643
+ f"Question: {question}\n\n"
644
+ f"Document content:\n{full_text[:60000]}"
645
+ ),
646
+ }
647
+ ]
648
+ resp = self.client.messages.create(
649
+ model=self.model,
650
+ max_tokens=2048,
651
+ messages=messages,
652
+ )
653
+ return resp.content[0].text
654
+
655
+ # ------------------------------------------------------------------
656
+ # Internal methods
657
+ # ------------------------------------------------------------------
658
+
659
+ def _parse_chunk(
660
+ self,
661
+ pdf: PDFDocument,
662
+ doc_hash: str,
663
+ chunk_start: int,
664
+ chunk_end: int,
665
+ ) -> List[PageResult]:
666
+ """Parse a range of pages, using cache when available."""
667
+ results = []
668
+ pages_to_process = []
669
+
670
+ for pg in range(chunk_start, chunk_end):
671
+ cached = self.cache.get_page(doc_hash, pg + 1, self.model, self.mode)
672
+ if cached:
673
+ logger.debug("Cache hit page %d", pg + 1)
674
+ results.append(cached)
675
+ else:
676
+ pages_to_process.append(pg)
677
+
678
+ if not pages_to_process:
679
+ return results
680
+
681
+ # Group consecutive un-cached pages into sub-chunks
682
+ sub_chunks = self._group_consecutive(pages_to_process)
683
+ for sub_start, sub_end in sub_chunks:
684
+ sub_results = self._call_api_chunk(pdf, doc_hash, sub_start, sub_end)
685
+ results.extend(sub_results)
686
+
687
+ results.sort(key=lambda r: r.page_number)
688
+ return results
689
+
690
+ @staticmethod
691
+ def _group_consecutive(pages: List[int]) -> List[Tuple[int, int]]:
692
+ if not pages:
693
+ return []
694
+ groups, start, prev = [], pages[0], pages[0]
695
+ for p in pages[1:]:
696
+ if p != prev + 1:
697
+ groups.append((start, prev + 1))
698
+ start = p
699
+ prev = p
700
+ groups.append((start, prev + 1))
701
+ return groups
702
+
703
+ def _call_api_chunk(
704
+ self,
705
+ pdf: PDFDocument,
706
+ doc_hash: str,
707
+ chunk_start: int,
708
+ chunk_end: int,
709
+ ) -> List[PageResult]:
710
+ """Send pages to Claude API and parse response."""
711
+ t_start = time.time()
712
+
713
+ if self.mode == "image":
714
+ return self._call_api_as_images(pdf, doc_hash, chunk_start, chunk_end, t_start)
715
+ else:
716
+ return self._call_api_native(pdf, doc_hash, chunk_start, chunk_end, t_start)
717
+
718
    def _call_api_native(
        self,
        pdf: PDFDocument,
        doc_hash: str,
        chunk_start: int,
        chunk_end: int,
        t_start: float,
    ) -> List[PageResult]:
        """Send pages [chunk_start, chunk_end) to the API as one base64 PDF document block.

        The sub-PDF is extracted in memory, base64-encoded, and marked with
        cache_control so the provider can cache the large document payload.
        Returns the parsed PageResults via _execute_api_call.
        """
        chunk_bytes = pdf.get_chunk_as_pdf_bytes(chunk_start, chunk_end)
        b64_pdf = base64.standard_b64encode(chunk_bytes).decode("utf-8")
        num_pages = chunk_end - chunk_start

        # Tell the model exactly how many page objects to emit and how to number them
        # (1-indexed, matching the original document).
        prompt_suffix = (
            f"\nThis PDF chunk contains pages {chunk_start + 1} to {chunk_end} "
            f"of the original document. "
            f"Return a JSON array with exactly {num_pages} page objects matching the schema. "
            f"Index them page_number={chunk_start + 1} through {chunk_end}."
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": b64_pdf,
                        },
                        # Ephemeral prompt caching for the (large) document payload.
                        "cache_control": {"type": "ephemeral"},
                    },
                    {
                        "type": "text",
                        "text": PAGE_EXTRACTION_PROMPT + prompt_suffix,
                    },
                ],
            }
        ]

        return self._execute_api_call(messages, doc_hash, chunk_start, chunk_end, t_start, "native")
759
+
760
    def _call_api_as_images(
        self,
        pdf: PDFDocument,
        doc_hash: str,
        chunk_start: int,
        chunk_end: int,
        t_start: float,
    ) -> List[PageResult]:
        """Send pages [chunk_start, chunk_end) to the API as rendered PNG images.

        Each page is rendered at IMAGE_DPI and preceded by a text marker so the
        model can attribute extracted content to the right page. Returns the
        parsed PageResults via _execute_api_call.
        """
        content = []
        for pg_idx in range(chunk_start, chunk_end):
            png_bytes = pdf.get_page_as_png_bytes(pg_idx, dpi=IMAGE_DPI)
            b64_img = base64.standard_b64encode(png_bytes).decode("utf-8")
            # Marker before each image ties the picture to its 1-indexed page number.
            content.append({
                "type": "text",
                "text": f"--- Page {pg_idx + 1} ---",
            })
            content.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": b64_img,
                },
            })

        num_pages = chunk_end - chunk_start
        # Extraction instructions go last, after all page images.
        prompt_suffix = (
            f"\nThese are page images {chunk_start + 1} through {chunk_end}. "
            f"Return a JSON array with exactly {num_pages} page objects matching the schema. "
            f"Index them page_number={chunk_start + 1} through {chunk_end}."
        )
        content.append({"type": "text", "text": PAGE_EXTRACTION_PROMPT + prompt_suffix})

        messages = [{"role": "user", "content": content}]
        return self._execute_api_call(messages, doc_hash, chunk_start, chunk_end, t_start, "image")
795
+
796
    def _execute_api_call(
        self,
        messages: List[Dict],
        doc_hash: str,
        chunk_start: int,
        chunk_end: int,
        t_start: float,
        mode: str,
    ) -> List[PageResult]:
        """Run one extraction request with retries; parse, cache, and return page results.

        Rate-limited calls are retried with exponential backoff (5s, 10s, 20s);
        other API status errors propagate. Unparseable JSON degrades to
        placeholder PageResults for every page in the chunk instead of raising.
        """
        retries, delay = 3, 5
        for attempt in range(retries):
            try:
                resp = self.client.messages.create(
                    model=self.model,
                    max_tokens=MAX_TOKENS_OUTPUT,
                    system=SYSTEM_PROMPT,
                    messages=messages,
                )
                break
            except anthropic.RateLimitError:
                # Last attempt: surface the rate limit to the caller.
                if attempt == retries - 1:
                    raise
                logger.warning("Rate limit hit; retrying in %ds...", delay)
                time.sleep(delay)
                delay *= 2  # exponential backoff
            except anthropic.APIStatusError as exc:
                logger.error("API error: %s", exc)
                raise

        raw_response = resp.content[0].text.strip()
        tokens_used = resp.usage.input_tokens + resp.usage.output_tokens
        elapsed = time.time() - t_start

        # Clean possible markdown fences
        if raw_response.startswith("```"):
            lines = raw_response.split("\n")
            raw_response = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])

        try:
            parsed = json.loads(raw_response)
        except json.JSONDecodeError as exc:
            logger.error("JSON parse error on API response: %s\nRaw:\n%s", exc, raw_response[:500])
            # Return minimal fallback for affected pages
            return [
                PageResult(
                    page_number=pg + 1,
                    raw_text="[PARSE ERROR: JSON decode failed]",
                    summary="Failed to parse this page.",
                    processing_mode=mode,
                    # Spread the token cost evenly across the failed pages.
                    tokens_used=tokens_used // max(1, chunk_end - chunk_start),
                    processing_time_s=elapsed,
                )
                for pg in range(chunk_start, chunk_end)
            ]

        # Handle both array-of-pages and single-page responses
        if isinstance(parsed, dict):
            parsed = [parsed]

        results = []
        for i, page_data in enumerate(parsed):
            # NOTE(review): page numbering is positional — if the model returns a
            # different number of page objects than requested, pages may be
            # misnumbered or silently missing. Confirm upstream handling.
            pg_num = chunk_start + i + 1
            page_data["page_number"] = pg_num
            page_data["processing_mode"] = mode
            page_data["tokens_used"] = tokens_used // len(parsed)
            page_data["processing_time_s"] = elapsed / len(parsed)

            pr = self._dict_to_page_result(page_data)
            # Persist each page so re-runs with the same model/mode are free.
            self.cache.set_page(doc_hash, pr, self.model, mode)
            results.append(pr)

        return results
868
+
869
+ @staticmethod
870
+ def _dict_to_page_result(d: Dict) -> PageResult:
871
+ equations = [
872
+ EquationBlock(
873
+ page=d["page_number"],
874
+ index=e.get("index", i),
875
+ latex=e.get("latex", ""),
876
+ description=e.get("description", ""),
877
+ inline=e.get("inline", False),
878
+ )
879
+ for i, e in enumerate(d.get("equations", []))
880
+ ]
881
+ tables = [
882
+ TableBlock(
883
+ page=d["page_number"],
884
+ index=t.get("index", i),
885
+ markdown=t.get("markdown", ""),
886
+ json_data=t.get("json_data", []),
887
+ caption=t.get("caption", ""),
888
+ )
889
+ for i, t in enumerate(d.get("tables", []))
890
+ ]
891
+ algorithms = [
892
+ AlgorithmBlock(
893
+ page=d["page_number"],
894
+ index=a.get("index", i),
895
+ name=a.get("name", f"Algorithm {i+1}"),
896
+ language=a.get("language", "pseudocode"),
897
+ code=a.get("code", ""),
898
+ description=a.get("description", ""),
899
+ )
900
+ for i, a in enumerate(d.get("algorithms", []))
901
+ ]
902
+ figures = [
903
+ FigureBlock(
904
+ page=d["page_number"],
905
+ index=f.get("index", i),
906
+ figure_type=f.get("figure_type", "other"),
907
+ description=f.get("description", ""),
908
+ data_summary=f.get("data_summary", ""),
909
+ caption=f.get("caption", ""),
910
+ )
911
+ for i, f in enumerate(d.get("figures", []))
912
+ ]
913
+ return PageResult(
914
+ page_number = d["page_number"],
915
+ raw_text = d.get("raw_text", ""),
916
+ summary = d.get("summary", ""),
917
+ equations = equations,
918
+ tables = tables,
919
+ algorithms = algorithms,
920
+ figures = figures,
921
+ section_headers = d.get("section_headers", []),
922
+ references = d.get("references", []),
923
+ keywords = d.get("keywords", []),
924
+ layout_notes = d.get("layout_notes", ""),
925
+ processing_mode = d.get("processing_mode", "native"),
926
+ tokens_used = d.get("tokens_used", 0),
927
+ processing_time_s = d.get("processing_time_s", 0.0),
928
+ )
929
+
930
    def _extract_document_meta(self, page_results: List[PageResult]) -> Dict:
        """Ask the model for title/authors/abstract/summary from the first pages.

        Best-effort: any failure (API error or bad JSON) degrades to empty
        metadata rather than failing the whole parse.
        """
        # Use first 5 pages for metadata extraction
        sample_text = "\n\n".join(
            f"[Page {p.page_number}]\n{p.raw_text}" for p in page_results[:5]
        )
        messages = [
            {
                "role": "user",
                "content": (
                    f"{DOCUMENT_META_PROMPT}\n\nDocument sample:\n{sample_text[:8000]}"
                ),
            }
        ]
        try:
            resp = self.client.messages.create(
                model=self.model,
                max_tokens=1024,
                system=SYSTEM_PROMPT,
                messages=messages,
            )
            raw = resp.content[0].text.strip()
            # Strip a possible ``` fence around the JSON response.
            if raw.startswith("```"):
                lines = raw.split("\n")
                raw = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])
            return json.loads(raw)
        except Exception as exc:
            logger.warning("Document meta extraction failed: %s", exc)
            return {"title": "", "authors": [], "abstract": "", "document_summary": ""}
958
+
959
+
960
+ # ---------------------------------------------------------------------------
961
+ # Output formatters
962
+ # ---------------------------------------------------------------------------
963
+
964
class OutputFormatter:
    """Render a DocumentResult as JSON, Markdown, plain text, or a console table."""

    @staticmethod
    def to_json(result: DocumentResult, indent: int = 2) -> str:
        """Serialize the full result (dataclass tree) to pretty-printed JSON."""
        return json.dumps(asdict(result), indent=indent, ensure_ascii=False)

    @staticmethod
    def to_markdown(result: DocumentResult) -> str:
        """Render a full Markdown report: metadata header, then one section per page."""
        lines = []
        lines.append(f"# {result.title or Path(result.document_path).name}")
        if result.authors:
            lines.append(f"\n**Authors:** {', '.join(result.authors)}")
        lines.append(f"\n**Document Hash:** `{result.document_hash}`")
        lines.append(f"**Model:** {result.model} | **Mode:** {result.processing_mode}")
        lines.append(
            f"**Pages:** {result.pages_processed}/{result.total_pages} | "
            f"**Tokens:** {result.total_tokens_used:,} | "
            f"**Time:** {result.total_processing_time_s:.1f}s"
        )
        lines.append(
            f"**Equations:** {result.total_equations} | "
            f"**Tables:** {result.total_tables} | "
            f"**Algorithms:** {result.total_algorithms} | "
            f"**Figures:** {result.total_figures}"
        )
        if result.abstract:
            lines.append(f"\n## Abstract\n\n{result.abstract}")
        if result.document_summary:
            lines.append(f"\n## Document Summary\n\n{result.document_summary}")

        # One section per page; sub-sections only for content that exists.
        for page in result.page_results:
            lines.append(f"\n---\n\n## Page {page.page_number}")
            if page.section_headers:
                lines.append("\n### Sections\n" + "\n".join(f"- {h}" for h in page.section_headers))
            lines.append(f"\n### Summary\n{page.summary}")
            lines.append(f"\n### Full Text\n\n{page.raw_text}")

            if page.equations:
                lines.append("\n### Equations\n")
                for eq in page.equations:
                    lines.append(f"**Eq {eq.index}** ({('inline' if eq.inline else 'display')})")
                    lines.append(f"```latex\n{eq.latex}\n```")
                    lines.append(f"*{eq.description}*\n")

            if page.tables:
                lines.append("\n### Tables\n")
                for tb in page.tables:
                    if tb.caption:
                        lines.append(f"**{tb.caption}**\n")
                    lines.append(tb.markdown + "\n")

            if page.algorithms:
                lines.append("\n### Algorithms\n")
                for al in page.algorithms:
                    lines.append(f"**{al.name}** ({al.language})\n")
                    lines.append(f"```{al.language}\n{al.code}\n```")
                    lines.append(f"*{al.description}*\n")

            if page.figures:
                lines.append("\n### Figures\n")
                for fg in page.figures:
                    lines.append(f"**Figure {fg.index}** [{fg.figure_type}]")
                    if fg.caption:
                        lines.append(f"*{fg.caption}*")
                    lines.append(fg.description)
                    if fg.data_summary:
                        lines.append(f"Data: {fg.data_summary}\n")

        return "\n".join(lines)

    @staticmethod
    def to_text(result: DocumentResult) -> str:
        """Render a minimal plain-text dump: header, document summary, raw page text."""
        lines = [
            f"DOCUMENT: {result.title or Path(result.document_path).name}",
            f"Authors: {', '.join(result.authors)}",
            f"Pages processed: {result.pages_processed}/{result.total_pages}",
            "",
            "SUMMARY",
            "=" * 60,
            result.document_summary,
            "",
        ]
        for page in result.page_results:
            lines.append(f"\n[PAGE {page.page_number}]")
            lines.append(page.raw_text)
        return "\n".join(lines)

    @staticmethod
    def print_summary_table(result: DocumentResult) -> None:
        """Print a rich summary table of the parse metrics to the console."""
        table = Table(title=f"Parse Results: {Path(result.document_path).name}", show_lines=True)
        table.add_column("Metric", style="cyan", no_wrap=True)
        table.add_column("Value", style="green")

        table.add_row("Title", result.title or "(unknown)")
        table.add_row("Authors", ", ".join(result.authors) or "(unknown)")
        table.add_row("Model", result.model)
        table.add_row("Mode", result.processing_mode)
        table.add_row("Pages total", str(result.total_pages))
        table.add_row("Pages parsed", str(result.pages_processed))
        table.add_row("Equations", str(result.total_equations))
        table.add_row("Tables", str(result.total_tables))
        table.add_row("Algorithms", str(result.total_algorithms))
        table.add_row("Figures", str(result.total_figures))
        table.add_row("Tokens used", f"{result.total_tokens_used:,}")
        table.add_row("Processing time", f"{result.total_processing_time_s:.1f}s")
        table.add_row("Document hash", result.document_hash)

        console.print(table)
1071
+
1072
+
1073
+ # ---------------------------------------------------------------------------
1074
+ # Agent interface
1075
+ # ---------------------------------------------------------------------------
1076
+
1077
class AgentPDFInterface:
    """
    High-level interface designed for use within agent pipelines.
    All methods accept a file path and return serializable Python objects.

    Example usage in an agent:
        from pdf_atomic_parser import AgentPDFInterface

        agent = AgentPDFInterface(model="opus")
        full = agent.parse("paper.pdf")
        eqs = agent.get_equations("paper.pdf")
        answer = agent.ask("paper.pdf", "What is the loss function?")
    """

    def __init__(self, **kwargs):
        self._parser = AtomicPDFParser(**kwargs)

    def parse(self, pdf_path: str, page_range: Optional[Tuple[int, int]] = None) -> Dict:
        """Parse a PDF and return the full result as a plain dict."""
        return asdict(self._parser.parse(pdf_path, page_range))

    def get_equations(self, pdf_path: str) -> List[Dict]:
        """All equation blocks as dicts."""
        return [asdict(block) for block in self._parser.extract_equations(pdf_path)]

    def get_tables(self, pdf_path: str) -> List[Dict]:
        """All table blocks as dicts."""
        return [asdict(block) for block in self._parser.extract_tables(pdf_path)]

    def get_algorithms(self, pdf_path: str) -> List[Dict]:
        """All algorithm blocks as dicts."""
        return [asdict(block) for block in self._parser.extract_algorithms(pdf_path)]

    def get_figures(self, pdf_path: str) -> List[Dict]:
        """All figure blocks as dicts."""
        return [asdict(block) for block in self._parser.extract_figures(pdf_path)]

    def ask(self, pdf_path: str, question: str) -> str:
        """Answer a free-form question against the parsed document."""
        return self._parser.query(pdf_path, question)

    def get_full_text(self, pdf_path: str) -> str:
        """Concatenated per-page raw text with [Page N] markers."""
        pages = self._parser.parse(pdf_path).page_results
        return "\n\n".join(
            f"[Page {p.page_number}]\n{p.raw_text}"
            for p in pages
        )

    def cache_stats(self) -> Dict:
        """Statistics from the underlying on-disk parse cache."""
        return self._parser.cache.stats()
1122
+
1123
+
1124
+ # ---------------------------------------------------------------------------
1125
+ # Batch processor
1126
+ # ---------------------------------------------------------------------------
1127
+
1128
def batch_process(
    input_dir: Path,
    output_dir: Path,
    parser: AtomicPDFParser,
    fmt: str = "json",
) -> None:
    """Parse every PDF under input_dir (recursively) and write one output file per PDF.

    Failures on individual files are reported and logged, then skipped.
    """
    pdfs = sorted(input_dir.glob("**/*.pdf"))
    if not pdfs:
        console.print(f"[yellow]No PDF files found in {input_dir}[/yellow]")
        return

    output_dir.mkdir(parents=True, exist_ok=True)
    console.print(f"[cyan]Found {len(pdfs)} PDF files to process.[/cyan]")

    for pdf_path in pdfs:
        console.print(f"\n[bold]Processing:[/bold] {pdf_path.name}")
        try:
            result = parser.parse(pdf_path)
            # Select extension and renderer by requested format; default is text.
            if fmt == "json":
                target = output_dir / f"{pdf_path.stem}.json"
                rendered = OutputFormatter.to_json(result)
            elif fmt == "markdown":
                target = output_dir / f"{pdf_path.stem}.md"
                rendered = OutputFormatter.to_markdown(result)
            else:
                target = output_dir / f"{pdf_path.stem}.txt"
                rendered = OutputFormatter.to_text(result)
            target.write_text(rendered, encoding="utf-8")
            console.print(f" [green]Saved:[/green] {target}")
            OutputFormatter.print_summary_table(result)
        except Exception as exc:
            console.print(f" [red]Error processing {pdf_path.name}: {exc}[/red]")
            logger.exception("Batch error")
1161
+
1162
+
1163
+ # ---------------------------------------------------------------------------
1164
+ # Token estimator
1165
+ # ---------------------------------------------------------------------------
1166
+
1167
def estimate_tokens(pdf_path: Path) -> None:
    """Print a rough token and cost estimate for parsing the given PDF."""
    with PDFDocument(pdf_path) as pdf:
        page_count = pdf.total_pages
        size_mb = pdf.file_size_bytes / 1e6

    # Rough estimate: ~800 tokens per page for dense academic content
    tokens_in = page_count * 800
    tokens_out = page_count * 400

    # Pricing approximate (Opus: $15/Mtok in, $75/Mtok out as of 2025)
    cost_opus = (tokens_in * 15 + tokens_out * 75) / 1_000_000

    table = Table(title=f"Token Estimate: {pdf_path.name}", show_lines=True)
    table.add_column("Metric", style="cyan")
    table.add_column("Estimate", style="yellow")

    table.add_row("Total pages", str(page_count))
    table.add_row("File size", f"{size_mb:.2f} MB")
    table.add_row("Est. input tokens", f"{tokens_in:,}")
    table.add_row("Est. output tokens", f"{tokens_out:,}")
    table.add_row("Est. total tokens", f"{tokens_in + tokens_out:,}")
    table.add_row("Est. cost (Opus)", f"${cost_opus:.2f}")
    table.add_row("Note", "Estimate only; actual usage varies")

    console.print(table)
1193
+
1194
+
1195
+ # ---------------------------------------------------------------------------
1196
+ # CLI
1197
+ # ---------------------------------------------------------------------------
1198
+
1199
def build_cli() -> argparse.ArgumentParser:
    """Build the argparse CLI: global model/mode options plus one subcommand per action."""
    parser = argparse.ArgumentParser(
        prog="pdf_atomic_parser",
        description="Atomic PDF parser powered by Claude claude-opus-4-6",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Global options apply before any subcommand.
    parser.add_argument("--model", default="opus", help="opus | sonnet | haiku | full-model-string")
    parser.add_argument("--mode", default="native", choices=["native", "image"], help="Parsing mode")
    parser.add_argument("--chunk-size", type=int, default=CHUNK_SIZE_DEFAULT, help="Pages per API call")
    parser.add_argument("--verbose", action="store_true")

    sub = parser.add_subparsers(dest="command", required=True)

    # parse
    p_parse = sub.add_parser("parse", help="Parse a PDF fully")
    p_parse.add_argument("pdf", help="Path to PDF file")
    p_parse.add_argument("--output", "-o", help="Output file path")
    p_parse.add_argument("--format", "-f", default="json", choices=["json", "markdown", "text"])
    p_parse.add_argument("--pages", help="Page range e.g. 1-50")

    # atomic (alias for parse with all content)
    p_atomic = sub.add_parser("atomic", help="Full atomic extraction to directory")
    p_atomic.add_argument("pdf", help="Path to PDF file")
    p_atomic.add_argument("--output", "-o", default="./atomic_output")

    # extract-equations
    p_eq = sub.add_parser("extract-equations", help="Extract LaTeX equations")
    p_eq.add_argument("pdf")
    p_eq.add_argument("--output", "-o")

    # extract-tables
    p_tb = sub.add_parser("extract-tables", help="Extract tables")
    p_tb.add_argument("pdf")
    p_tb.add_argument("--output", "-o")

    # extract-algorithms
    p_al = sub.add_parser("extract-algorithms", help="Extract algorithms/code")
    p_al.add_argument("pdf")
    p_al.add_argument("--output", "-o")

    # extract-figures
    p_fg = sub.add_parser("extract-figures", help="Extract figure descriptions")
    p_fg.add_argument("pdf")
    p_fg.add_argument("--output", "-o")

    # query
    p_q = sub.add_parser("query", help="Ask a question about the PDF")
    p_q.add_argument("pdf")
    p_q.add_argument("question", help="Question to ask")

    # batch
    p_batch = sub.add_parser("batch", help="Batch process a directory of PDFs")
    p_batch.add_argument("directory")
    p_batch.add_argument("--output", "-o", default="./batch_output")
    p_batch.add_argument("--format", "-f", default="json", choices=["json", "markdown", "text"])

    # estimate
    p_est = sub.add_parser("estimate", help="Estimate token cost before parsing")
    p_est.add_argument("pdf")

    # cache commands
    sub.add_parser("cache-stats", help="Show cache statistics")
    sub.add_parser("list-cache", help="List all cached documents")
    p_cc = sub.add_parser("clear-cache", help="Clear cache for a document")
    p_cc.add_argument("pdf", help="PDF path (to identify document)")

    return parser
1266
+
1267
+
1268
def parse_page_range(s: str) -> Tuple[int, int]:
    """Parse a page-range string such as ``"1-50"`` into ``(start, end)``.

    Args:
        s: Range in ``start-end`` form (1-based, inclusive).

    Returns:
        Tuple of ``(start, end)`` page numbers.

    Raises:
        ValueError: if the string is not two dash-separated integers,
            if either bound is not an integer, if start < 1, or if
            start > end (previously such inverted/non-positive ranges
            were accepted silently and failed later).
    """
    parts = s.split("-")
    if len(parts) != 2:
        raise ValueError(f"Page range must be in format start-end, got: {s}")
    try:
        start, end = int(parts[0]), int(parts[1])
    except ValueError:
        # Re-raise with the full input for a clearer message than int()'s own.
        raise ValueError(f"Page range bounds must be integers, got: {s}") from None
    if start < 1:
        raise ValueError(f"Page numbers start at 1, got: {s}")
    if start > end:
        raise ValueError(f"Page range start exceeds end: {s}")
    return start, end
1273
+
1274
+
1275
def save_output(content: str, output_path: Optional[str], default_name: str) -> None:
    """Write *content* to *output_path*, falling back to *default_name*.

    Parent directories are created as needed; the file is written as UTF-8
    and the destination is echoed to the console.
    """
    destination = Path(output_path or default_name)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(content, encoding="utf-8")
    console.print(f"[green]Saved:[/green] {destination}")
1280
+
1281
+
1282
def _format_result(result, fmt: str) -> str:
    """Render a parse result as the requested format ('json'/'markdown'/'text')."""
    if fmt == "json":
        return OutputFormatter.to_json(result)
    if fmt == "markdown":
        return OutputFormatter.to_markdown(result)
    return OutputFormatter.to_text(result)


def main() -> None:
    """CLI entry point: parse arguments and dispatch the chosen sub-command.

    Cache-only commands (``cache-stats``, ``list-cache``, ``clear-cache``) and
    ``estimate`` are handled before an ``AtomicPDFParser`` is constructed, so
    they work without touching the API client at all.
    """
    cli = build_cli()
    args = cli.parse_args()
    cache = ParseCache(Path.home() / ".cache" / "pdf_atomic_parser")

    if args.command == "cache-stats":
        stats = cache.stats()
        table = Table(title="Cache Statistics", show_lines=True)
        table.add_column("Key", style="cyan")
        table.add_column("Value", style="green")
        for k, v in stats.items():
            table.add_row(k.replace("_", " ").title(), str(v))
        console.print(table)
        return

    if args.command == "list-cache":
        import datetime  # fix: hoisted out of the per-row loop below

        docs = cache.list_documents()
        if not docs:
            console.print("[yellow]Cache is empty.[/yellow]")
            return
        table = Table(title="Cached Documents", show_lines=True)
        table.add_column("Hash", style="cyan")
        table.add_column("Cached Pages", style="green")
        table.add_column("First Seen", style="dim")
        for d in docs:
            ts = datetime.datetime.fromtimestamp(d["first_seen"]).strftime("%Y-%m-%d %H:%M")
            table.add_row(d["hash"], str(d["cached_pages"]), ts)
        console.print(table)
        return

    if args.command == "estimate":
        estimate_tokens(Path(args.pdf))
        return

    if args.command == "clear-cache":
        # Fix: moved ahead of parser construction — clearing the cache does not
        # need an AtomicPDFParser (the original built one it never used here).
        doc_hash = cache.file_hash(Path(args.pdf))
        n = cache.clear_document(doc_hash)
        console.print(f"[green]Cleared {n} cached pages for {Path(args.pdf).name}[/green]")
        return

    parser = AtomicPDFParser(
        model=args.model,
        mode=args.mode,
        chunk_size=args.chunk_size,
        verbose=args.verbose,
    )

    if args.command in ("parse", "atomic"):
        page_range = None
        if getattr(args, "pages", None):
            page_range = parse_page_range(args.pages)

        result = parser.parse(args.pdf, page_range)
        OutputFormatter.print_summary_table(result)

        stem = Path(args.pdf).stem
        if args.command == "atomic":
            # Atomic mode writes all three formats side by side in one directory.
            out_dir = Path(args.output)
            for fmt, fn in [("json", f"{stem}.json"), ("markdown", f"{stem}.md"), ("text", f"{stem}.txt")]:
                (out_dir / fn).parent.mkdir(parents=True, exist_ok=True)
                (out_dir / fn).write_text(_format_result(result, fmt), encoding="utf-8")
                console.print(f"[green]Saved {fmt}:[/green] {out_dir / fn}")
        else:
            fmt = args.format
            ext = "md" if fmt == "markdown" else fmt
            save_output(_format_result(result, fmt), getattr(args, "output", None), f"{stem}_parsed.{ext}")

    elif args.command.startswith("extract-"):
        # extract-equations / extract-tables / extract-algorithms / extract-figures
        # all follow the same shape; the page-result attribute, the output-file
        # suffix, and the console label all equal the command suffix.
        kind = args.command.split("-", 1)[1]
        result = parser.parse(args.pdf)
        items = [asdict(item) for page in result.page_results for item in getattr(page, kind)]
        content = json.dumps(items, indent=2, ensure_ascii=False)
        save_output(content, args.output, f"{Path(args.pdf).stem}_{kind}.json")
        console.print(f"[cyan]{len(items)} {kind} extracted.[/cyan]")

    elif args.command == "query":
        answer = parser.query(args.pdf, args.question)
        console.print(f"\n[bold cyan]Answer:[/bold cyan]\n{answer}")

    elif args.command == "batch":
        batch_process(
            Path(args.directory),
            Path(args.output),
            parser,
            getattr(args, "format", "json"),
        )
1402
+
1403
+
1404
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ anthropic>=0.43.0
2
+ PyMuPDF>=1.24.0
3
+ rich>=13.7.0
4
+ tqdm>=4.66.0