Upload 6 files
Browse files- .gitattributes +37 -34
- .gitignore +97 -0
- README.md +300 -0
- model_card.yml +88 -0
- pdf_atomic_parser.py +1405 -0
- requirements.txt +4 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,38 @@
|
|
| 1 |
-
|
| 2 |
-
*
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
*.
|
| 6 |
-
*.
|
| 7 |
-
*.
|
| 8 |
-
*.
|
| 9 |
-
*.
|
| 10 |
-
*.
|
| 11 |
-
*.
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
*.
|
| 15 |
-
*.
|
| 16 |
-
*.
|
| 17 |
-
*.
|
| 18 |
-
*.
|
| 19 |
-
*.
|
| 20 |
-
*.
|
| 21 |
-
*.
|
| 22 |
-
*.
|
| 23 |
-
*.
|
| 24 |
-
*.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
|
| 27 |
-
*.
|
| 28 |
-
*.
|
| 29 |
-
*.
|
| 30 |
-
*.
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
*.
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Handle line endings automatically for all files tracked by Git
|
| 2 |
+
* text=auto eol=lf
|
| 3 |
+
|
| 4 |
+
# Explicitly declare Python as text
|
| 5 |
+
*.py text eol=lf linguist-language=Python
|
| 6 |
+
*.txt text eol=lf
|
| 7 |
+
*.md text eol=lf
|
| 8 |
+
*.yml text eol=lf
|
| 9 |
+
*.yaml text eol=lf
|
| 10 |
+
*.json text eol=lf
|
| 11 |
+
*.sh text eol=lf
|
| 12 |
+
|
| 13 |
+
# Binary files - do not attempt to process line endings
|
| 14 |
+
*.pdf binary
|
| 15 |
+
*.png binary
|
| 16 |
+
*.jpg binary
|
| 17 |
+
*.jpeg binary
|
| 18 |
+
*.gif binary
|
| 19 |
+
*.ico binary
|
| 20 |
+
*.db binary
|
| 21 |
+
*.zip binary
|
| 22 |
+
*.tar binary
|
| 23 |
+
*.gz binary
|
| 24 |
+
*.whl binary
|
| 25 |
+
|
| 26 |
+
# Hugging Face LFS tracked files
|
| 27 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
|
| 37 |
+
# Statistics
|
| 38 |
+
*.ipynb linguist-detectable=true
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
share/python-wheels/
|
| 20 |
+
*.egg-info/
|
| 21 |
+
.installed.cfg
|
| 22 |
+
*.egg
|
| 23 |
+
MANIFEST
|
| 24 |
+
pip-wheel-metadata/
|
| 25 |
+
share/python-wheels/
|
| 26 |
+
*.egg-info/
|
| 27 |
+
|
| 28 |
+
# Virtual environments
|
| 29 |
+
.venv/
|
| 30 |
+
venv/
|
| 31 |
+
ENV/
|
| 32 |
+
env/
|
| 33 |
+
.env
|
| 34 |
+
|
| 35 |
+
# Environment / secrets
|
| 36 |
+
.env
|
| 37 |
+
*.env
|
| 38 |
+
.env.*
|
| 39 |
+
!.env.example
|
| 40 |
+
|
| 41 |
+
# IDE
|
| 42 |
+
.vscode/
|
| 43 |
+
.idea/
|
| 44 |
+
*.sublime-project
|
| 45 |
+
*.sublime-workspace
|
| 46 |
+
.DS_Store
|
| 47 |
+
Thumbs.db
|
| 48 |
+
|
| 49 |
+
# Testing
|
| 50 |
+
.pytest_cache/
|
| 51 |
+
.coverage
|
| 52 |
+
htmlcov/
|
| 53 |
+
.tox/
|
| 54 |
+
.nox/
|
| 55 |
+
*.cover
|
| 56 |
+
*.py,cover
|
| 57 |
+
.hypothesis/
|
| 58 |
+
coverage.xml
|
| 59 |
+
nosetests.xml
|
| 60 |
+
pytest.xml
|
| 61 |
+
|
| 62 |
+
# Distribution
|
| 63 |
+
*.tar.gz
|
| 64 |
+
*.whl
|
| 65 |
+
|
| 66 |
+
# PDF parser specific
|
| 67 |
+
.pdf_parser_cache.db
|
| 68 |
+
atomic_output/
|
| 69 |
+
batch_output/
|
| 70 |
+
results/
|
| 71 |
+
*.parsed.json
|
| 72 |
+
*.parsed.md
|
| 73 |
+
*.parsed.txt
|
| 74 |
+
|
| 75 |
+
# Logs
|
| 76 |
+
*.log
|
| 77 |
+
logs/
|
| 78 |
+
|
| 79 |
+
# Jupyter
|
| 80 |
+
.ipynb_checkpoints/
|
| 81 |
+
*.ipynb
|
| 82 |
+
|
| 83 |
+
# macOS
|
| 84 |
+
.DS_Store
|
| 85 |
+
.AppleDouble
|
| 86 |
+
.LSOverride
|
| 87 |
+
|
| 88 |
+
# Windows
|
| 89 |
+
Thumbs.db
|
| 90 |
+
ehthumbs.db
|
| 91 |
+
Desktop.ini
|
| 92 |
+
|
| 93 |
+
# Type checking
|
| 94 |
+
.mypy_cache/
|
| 95 |
+
.dmypy.json
|
| 96 |
+
dmypy.json
|
| 97 |
+
.pytype/
|
README.md
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF Atomic Parser
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+

|
| 5 |
+

|
| 6 |
+

|
| 7 |
+

|
| 8 |
+

|
| 9 |
+
|
| 10 |
+
Atomically parse and understand complex PDF documents using **claude-opus-4-6** (Anthropic).
|
| 11 |
+
Handles equations, graphs, algorithms, unique drawings, multi-column layouts, scanned pages,
|
| 12 |
+
and 100+ page documents without hallucination.
|
| 13 |
+
|
| 14 |
+
Designed to be dropped into local agent pipelines as a callable module.
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## What Makes This Work
|
| 19 |
+
|
| 20 |
+
Claude processes PDFs natively through Anthropic's document API. Each page is sent as a
|
| 21 |
+
base64-encoded PDF chunk (or rendered at 300 DPI in image mode) alongside a structured
|
| 22 |
+
JSON extraction prompt. The model simultaneously sees:
|
| 23 |
+
|
| 24 |
+
- The rasterized visual content (charts, graphs, drawings, handwriting)
|
| 25 |
+
- The underlying text layer (searchable text, equations, captions)
|
| 26 |
+
|
| 27 |
+
This dual perception eliminates the need for separate OCR, layout parsers, or equation
|
| 28 |
+
recognizers. The model returns fully structured JSON containing LaTeX equations, Markdown
|
| 29 |
+
tables, verbatim algorithm code, and semantic figure descriptions per page.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Features
|
| 34 |
+
|
| 35 |
+
| Feature | Description |
|
| 36 |
+
|---|---|
|
| 37 |
+
| Native PDF API | Sends PDF bytes directly; Claude sees both text and visuals |
|
| 38 |
+
| Image mode | Renders pages at 300 DPI via PyMuPDF for maximum fidelity |
|
| 39 |
+
| LaTeX equations | Every equation extracted as proper LaTeX |
|
| 40 |
+
| Table extraction | Tables as Markdown and list-of-dicts JSON |
|
| 41 |
+
| Algorithm extraction | Pseudocode and code blocks verbatim with language detection |
|
| 42 |
+
| Figure description | Semantic descriptions of charts, plots, diagrams, drawings |
|
| 43 |
+
| SQLite caching | Pages are cached; re-runs skip already-parsed pages |
|
| 44 |
+
| Chunked processing | Handles 100+ page documents by splitting into chunks |
|
| 45 |
+
| Multiple output formats | JSON, Markdown, plain text |
|
| 46 |
+
| Agent interface | `AgentPDFInterface` class for programmatic use |
|
| 47 |
+
| Batch processing | Process entire directories of PDFs |
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Requirements
|
| 52 |
+
|
| 53 |
+
- Python 3.10 or higher
|
| 54 |
+
- An Anthropic API key with access to `claude-opus-4-6`
|
| 55 |
+
- No GPU required; all inference runs through the Anthropic API
|
| 56 |
+
|
| 57 |
+
### External System Dependencies
|
| 58 |
+
|
| 59 |
+
PyMuPDF (installed via pip) requires no external system libraries on most platforms.
|
| 60 |
+
On some Linux systems (only if no pre-built wheel is available for your platform and PyMuPDF must compile from source) you may need:
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
sudo apt-get install -y libmupdf-dev
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
On macOS:
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
brew install mupdf
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
On Windows: PyMuPDF ships with pre-built wheels on PyPI; no additional steps needed.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## Installation
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
git clone https://github.com/algorembrant/pdf-atomic-parser.git
|
| 80 |
+
cd pdf-atomic-parser
|
| 81 |
+
|
| 82 |
+
python -m venv .venv
|
| 83 |
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
| 84 |
+
|
| 85 |
+
pip install -r requirements.txt
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
Set your API key:
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
export ANTHROPIC_API_KEY="sk-ant-..." # Linux / macOS
|
| 92 |
+
set ANTHROPIC_API_KEY=sk-ant-... # Windows CMD
|
| 93 |
+
$env:ANTHROPIC_API_KEY="sk-ant-..." # Windows PowerShell
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## Quick Start
|
| 99 |
+
|
| 100 |
+
### Parse a PDF
|
| 101 |
+
|
| 102 |
+
```bash
|
| 103 |
+
python pdf_atomic_parser.py parse document.pdf
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
Outputs `document_parsed.json` in the current directory.
|
| 107 |
+
|
| 108 |
+
### Full Atomic Extraction (JSON + Markdown + Text)
|
| 109 |
+
|
| 110 |
+
```bash
|
| 111 |
+
python pdf_atomic_parser.py atomic document.pdf --output ./results/
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
### Ask a Question
|
| 115 |
+
|
| 116 |
+
```bash
|
| 117 |
+
python pdf_atomic_parser.py query document.pdf "What is the main loss function?"
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
### Extract Only Equations
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
python pdf_atomic_parser.py extract-equations document.pdf
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
### Use in an Agent Pipeline
|
| 127 |
+
|
| 128 |
+
```python
|
| 129 |
+
from pdf_atomic_parser import AgentPDFInterface
|
| 130 |
+
|
| 131 |
+
agent = AgentPDFInterface(model="opus")
|
| 132 |
+
|
| 133 |
+
# Full structured parse
|
| 134 |
+
result = agent.parse("paper.pdf")
|
| 135 |
+
|
| 136 |
+
# Just equations as list of dicts
|
| 137 |
+
equations = agent.get_equations("paper.pdf")
|
| 138 |
+
for eq in equations:
|
| 139 |
+
print(f"Page {eq['page']}: {eq['latex']}")
|
| 140 |
+
|
| 141 |
+
# Just tables
|
| 142 |
+
tables = agent.get_tables("paper.pdf")
|
| 143 |
+
|
| 144 |
+
# Semantic query
|
| 145 |
+
answer = agent.ask("paper.pdf", "What datasets were used for evaluation?")
|
| 146 |
+
print(answer)
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
---
|
| 150 |
+
|
| 151 |
+
## Usage Reference
|
| 152 |
+
|
| 153 |
+
### Command Overview
|
| 154 |
+
|
| 155 |
+
| Command | Purpose |
|
| 156 |
+
|---|---|
|
| 157 |
+
| `parse <pdf>` | Parse entire PDF to JSON/Markdown/text |
|
| 158 |
+
| `atomic <pdf>` | Full extraction to output directory (all formats) |
|
| 159 |
+
| `extract-equations <pdf>` | Extract LaTeX equations only |
|
| 160 |
+
| `extract-tables <pdf>` | Extract tables only |
|
| 161 |
+
| `extract-algorithms <pdf>` | Extract algorithms and code blocks only |
|
| 162 |
+
| `extract-figures <pdf>` | Extract figure descriptions only |
|
| 163 |
+
| `query <pdf> "<question>"` | Semantic question-answering over document |
|
| 164 |
+
| `batch <dir>` | Batch process all PDFs in a directory |
|
| 165 |
+
| `estimate <pdf>` | Estimate token count and cost before parsing |
|
| 166 |
+
| `cache-stats` | Show SQLite cache statistics |
|
| 167 |
+
| `list-cache` | List all cached documents |
|
| 168 |
+
| `clear-cache <pdf>` | Clear cached pages for a document |
|
| 169 |
+
|
| 170 |
+
### Global Options
|
| 171 |
+
|
| 172 |
+
| Option | Default | Description |
|
| 173 |
+
|---|---|---|
|
| 174 |
+
| `--model` | `opus` | `opus`, `sonnet`, `haiku`, or full model string |
|
| 175 |
+
| `--mode` | `native` | `native` (PDF bytes) or `image` (300 DPI PNG per page) |
|
| 176 |
+
| `--chunk-size` | `20` | Number of pages per API call |
|
| 177 |
+
| `--verbose` | off | Enable debug logging |
|
| 178 |
+
|
| 179 |
+
### parse / atomic Options
|
| 180 |
+
|
| 181 |
+
| Option | Default | Description |
|
| 182 |
+
|---|---|---|
|
| 183 |
+
| `--output / -o` | auto | Output file or directory path |
|
| 184 |
+
| `--format / -f` | `json` | `json`, `markdown`, or `text` |
|
| 185 |
+
| `--pages` | all | Page range, e.g. `1-50` |
|
| 186 |
+
|
| 187 |
+
---
|
| 188 |
+
|
| 189 |
+
## Output Schema
|
| 190 |
+
|
| 191 |
+
Each parsed document returns a `DocumentResult` with:
|
| 192 |
+
|
| 193 |
+
- `title`, `authors`, `abstract`, `document_summary`
|
| 194 |
+
- `page_results`: list of `PageResult` per page
|
| 195 |
+
|
| 196 |
+
Each `PageResult` contains:
|
| 197 |
+
|
| 198 |
+
```json
|
| 199 |
+
{
|
| 200 |
+
"page_number": 3,
|
| 201 |
+
"raw_text": "Full verbatim text...",
|
| 202 |
+
"summary": "This page describes...",
|
| 203 |
+
"section_headers": ["Introduction", "Related Work"],
|
| 204 |
+
"keywords": ["transformer", "attention", "BERT"],
|
| 205 |
+
"equations": [
|
| 206 |
+
{
|
| 207 |
+
"index": 0,
|
| 208 |
+
"latex": "\\mathcal{L} = -\\sum_{i} y_i \\log \\hat{y}_i",
|
| 209 |
+
"description": "Cross-entropy loss function",
|
| 210 |
+
"inline": false
|
| 211 |
+
}
|
| 212 |
+
],
|
| 213 |
+
"tables": [
|
| 214 |
+
{
|
| 215 |
+
"index": 0,
|
| 216 |
+
"markdown": "| Model | Accuracy |\n|---|---|\n| BERT | 94.2 |",
|
| 217 |
+
"json_data": [{"Model": "BERT", "Accuracy": "94.2"}],
|
| 218 |
+
"caption": "Table 1: Benchmark results"
|
| 219 |
+
}
|
| 220 |
+
],
|
| 221 |
+
"algorithms": [
|
| 222 |
+
{
|
| 223 |
+
"index": 0,
|
| 224 |
+
"name": "Algorithm 1: Backpropagation",
|
| 225 |
+
"language": "pseudocode",
|
| 226 |
+
"code": "for each layer l from L to 1:\n ...",
|
| 227 |
+
"description": "Gradient descent update rule"
|
| 228 |
+
}
|
| 229 |
+
],
|
| 230 |
+
"figures": [
|
| 231 |
+
{
|
| 232 |
+
"index": 0,
|
| 233 |
+
"figure_type": "line_chart",
|
| 234 |
+
"description": "Training loss over 100 epochs...",
|
| 235 |
+
"data_summary": "Y-axis: loss 0-2.0, X-axis: epoch 0-100...",
|
| 236 |
+
"caption": "Figure 2: Training curves"
|
| 237 |
+
}
|
| 238 |
+
]
|
| 239 |
+
}
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
---
|
| 243 |
+
|
| 244 |
+
## Choosing a Mode
|
| 245 |
+
|
| 246 |
+
| Scenario | Recommended Mode | Reason |
|
| 247 |
+
|---|---|---|
|
| 248 |
+
| Standard digital PDF | `native` (default) | Fastest, uses both text and visual layers |
|
| 249 |
+
| Scanned / photographed PDF | `image` | Text layer absent; vision handles everything |
|
| 250 |
+
| PDF with complex math | `image` | 300 DPI render ensures equation clarity |
|
| 251 |
+
| Very large file (>32 MB) | `image` | Native API has 32 MB size limit per chunk |
|
| 252 |
+
| Cost-sensitive workflow | `native` | Fewer tokens consumed |
|
| 253 |
+
|
| 254 |
+
---
|
| 255 |
+
|
| 256 |
+
## Cost Estimate
|
| 257 |
+
|
| 258 |
+
Rough estimates per 100-page academic paper:
|
| 259 |
+
|
| 260 |
+
| Model | Est. Tokens | Est. Cost |
|
| 261 |
+
|---|---|---|
|
| 262 |
+
| claude-opus-4-6 | ~120,000 | ~$3.50 |
|
| 263 |
+
| claude-sonnet-4-6 | ~120,000 | ~$0.60 |
|
| 264 |
+
| claude-haiku-4-5 | ~120,000 | ~$0.10 |
|
| 265 |
+
|
| 266 |
+
Use `python pdf_atomic_parser.py estimate document.pdf` for a per-document estimate.
|
| 267 |
+
|
| 268 |
+
---
|
| 269 |
+
|
| 270 |
+
## Caching
|
| 271 |
+
|
| 272 |
+
Parsed pages are stored in `~/.cache/pdf_atomic_parser/.pdf_parser_cache.db`.
|
| 273 |
+
Re-running on the same document skips already-parsed pages automatically.
|
| 274 |
+
The cache key is `(document_SHA256, page_number, model, mode)`.
|
| 275 |
+
|
| 276 |
+
---
|
| 277 |
+
|
| 278 |
+
## Project Structure
|
| 279 |
+
|
| 280 |
+
```
|
| 281 |
+
pdf-atomic-parser/
|
| 282 |
+
pdf_atomic_parser.py Main tool (single file, no splitting needed)
|
| 283 |
+
requirements.txt Python dependencies
|
| 284 |
+
README.md This file
|
| 285 |
+
model_card.yml Hugging Face model card
|
| 286 |
+
.gitignore
|
| 287 |
+
.gitattributes
|
| 288 |
+
```
|
| 289 |
+
|
| 290 |
+
---
|
| 291 |
+
|
| 292 |
+
## Author
|
| 293 |
+
|
| 294 |
+
**algorembrant**
|
| 295 |
+
|
| 296 |
+
---
|
| 297 |
+
|
| 298 |
+
## License
|
| 299 |
+
|
| 300 |
+
MIT License. See LICENSE file.
|
model_card.yml
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: mit
|
| 5 |
+
library_name: anthropic
|
| 6 |
+
tags:
|
| 7 |
+
- pdf
|
| 8 |
+
- document-parsing
|
| 9 |
+
- ocr
|
| 10 |
+
- multimodal
|
| 11 |
+
- equations
|
| 12 |
+
- table-extraction
|
| 13 |
+
- agent
|
| 14 |
+
- claude
|
| 15 |
+
- information-extraction
|
| 16 |
+
- scientific-documents
|
| 17 |
+
pipeline_tag: document-question-answering
|
| 18 |
+
model_name: PDF Atomic Parser
|
| 19 |
+
authors:
|
| 20 |
+
- algorembrant
|
| 21 |
+
sdk: other
|
| 22 |
+
sdk_version: "1.0.0"
|
| 23 |
+
app_file: pdf_atomic_parser.py
|
| 24 |
+
short_description: >
|
| 25 |
+
Atomically parse complex PDFs (equations, graphs, algorithms, tables)
|
| 26 |
+
  using Anthropic's claude-opus-4-6 model without hallucination. Agent-ready.
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
# PDF Atomic Parser
|
| 30 |
+
|
| 31 |
+
Powered by **claude-opus-4-6** (Anthropic).
|
| 32 |
+
|
| 33 |
+
## Description
|
| 34 |
+
|
| 35 |
+
A single-file Python tool for extracting structured content from complex
|
| 36 |
+
academic and technical PDFs. Works on documents containing:
|
| 37 |
+
|
| 38 |
+
- Mathematical equations (extracted as LaTeX)
|
| 39 |
+
- Data tables (extracted as Markdown + JSON)
|
| 40 |
+
- Algorithms and pseudocode (verbatim with language detection)
|
| 41 |
+
- Figures, charts, graphs, and drawings (semantic descriptions)
|
| 42 |
+
- Multi-column layouts, footnotes, margin notes
|
| 43 |
+
- 100+ page documents via automatic chunking
|
| 44 |
+
|
| 45 |
+
## Usage
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
pip install anthropic PyMuPDF rich tqdm
|
| 49 |
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
| 50 |
+
|
| 51 |
+
python pdf_atomic_parser.py parse document.pdf
|
| 52 |
+
python pdf_atomic_parser.py atomic document.pdf --output ./results/
|
| 53 |
+
python pdf_atomic_parser.py extract-equations document.pdf
|
| 54 |
+
python pdf_atomic_parser.py query document.pdf "What is the main theorem?"
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## Agent Integration
|
| 58 |
+
|
| 59 |
+
```python
|
| 60 |
+
from pdf_atomic_parser import AgentPDFInterface
|
| 61 |
+
|
| 62 |
+
agent = AgentPDFInterface(model="opus")
|
| 63 |
+
result = agent.parse("paper.pdf")
|
| 64 |
+
equations = agent.get_equations("paper.pdf")
|
| 65 |
+
tables = agent.get_tables("paper.pdf")
|
| 66 |
+
answer = agent.ask("paper.pdf", "What datasets were used?")
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## Model Details
|
| 70 |
+
|
| 71 |
+
| Property | Value |
|
| 72 |
+
|---|---|
|
| 73 |
+
| Underlying model | claude-opus-4-6 (Anthropic) |
|
| 74 |
+
| Parsing modes | native PDF, page-as-image (300 DPI) |
|
| 75 |
+
| Max pages per call | 20 (configurable) |
|
| 76 |
+
| Cache | SQLite, keyed by SHA-256 + page + model + mode |
|
| 77 |
+
| Output formats | JSON, Markdown, plain text |
|
| 78 |
+
|
| 79 |
+
## Citation
|
| 80 |
+
|
| 81 |
+
```bibtex
|
| 82 |
+
@software{algorembrant2025pdfparser,
|
| 83 |
+
author = {algorembrant},
|
| 84 |
+
title = {PDF Atomic Parser},
|
| 85 |
+
year = {2025},
|
| 86 |
+
url = {https://github.com/algorembrant/pdf-atomic-parser}
|
| 87 |
+
}
|
| 88 |
+
```
|
pdf_atomic_parser.py
ADDED
|
@@ -0,0 +1,1405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
pdf_atomic_parser.py
|
| 3 |
+
====================
|
| 4 |
+
Author : algorembrant
|
| 5 |
+
Version : 1.0.0
|
| 6 |
+
License : MIT
|
| 7 |
+
|
| 8 |
+
DESCRIPTION
|
| 9 |
+
-----------
|
| 10 |
+
Atomically parse and understand complex PDF documents using claude-opus-4-6 (Anthropic).
|
| 11 |
+
Handles equations, graphs, algorithms, unique drawings, tables, multi-column
|
| 12 |
+
layouts, and 100+ page documents without hallucination. Designed for local
|
| 13 |
+
agent pipelines.
|
| 14 |
+
|
| 15 |
+
CAPABILITIES
|
| 16 |
+
------------
|
| 17 |
+
- Native PDF document API (base64) with prompt caching
|
| 18 |
+
- Page-as-image fallback using PyMuPDF at 300 DPI for max fidelity
|
| 19 |
+
- LaTeX equation extraction
|
| 20 |
+
- Table extraction (Markdown + JSON)
|
| 21 |
+
- Algorithm and pseudocode extraction
|
| 22 |
+
- Figure and graph semantic description
|
| 23 |
+
- Multi-column and complex layout handling
|
| 24 |
+
- Chunked processing for 100+ page documents
|
| 25 |
+
- SQLite-backed cache to avoid re-processing pages
|
| 26 |
+
- Structured JSON output per page and full document
|
| 27 |
+
- Agent-callable interface (AgentPDFInterface)
|
| 28 |
+
- Async batch processing for speed
|
| 29 |
+
|
| 30 |
+
USAGE COMMANDS
|
| 31 |
+
--------------
|
| 32 |
+
# Parse a PDF and save structured JSON
|
| 33 |
+
python pdf_atomic_parser.py parse document.pdf
|
| 34 |
+
|
| 35 |
+
# Parse with verbose output
|
| 36 |
+
python pdf_atomic_parser.py parse document.pdf --verbose
|
| 37 |
+
|
| 38 |
+
# Parse specific page range
|
| 39 |
+
python pdf_atomic_parser.py parse document.pdf --pages 1-20
|
| 40 |
+
|
| 41 |
+
# Extract only equations (LaTeX)
|
| 42 |
+
python pdf_atomic_parser.py extract-equations document.pdf
|
| 43 |
+
|
| 44 |
+
# Extract only tables (Markdown)
|
| 45 |
+
python pdf_atomic_parser.py extract-tables document.pdf
|
| 46 |
+
|
| 47 |
+
# Extract only algorithms/code blocks
|
| 48 |
+
python pdf_atomic_parser.py extract-algorithms document.pdf
|
| 49 |
+
|
| 50 |
+
# Extract figures and graph descriptions
|
| 51 |
+
python pdf_atomic_parser.py extract-figures document.pdf
|
| 52 |
+
|
| 53 |
+
# Full atomic extraction (all content types) to output dir
|
| 54 |
+
python pdf_atomic_parser.py atomic document.pdf --output ./results/
|
| 55 |
+
|
| 56 |
+
# Query a parsed PDF (semantic search over cached parse)
|
| 57 |
+
python pdf_atomic_parser.py query document.pdf "What is the main theorem?"
|
| 58 |
+
|
| 59 |
+
# Use faster/cheaper model (Sonnet instead of Opus)
|
| 60 |
+
python pdf_atomic_parser.py parse document.pdf --model sonnet
|
| 61 |
+
|
| 62 |
+
# Use page-as-image mode (higher fidelity for scanned/complex PDFs)
|
| 63 |
+
python pdf_atomic_parser.py parse document.pdf --mode image
|
| 64 |
+
|
| 65 |
+
# Use native PDF mode (default, faster)
|
| 66 |
+
python pdf_atomic_parser.py parse document.pdf --mode native
|
| 67 |
+
|
| 68 |
+
# Set chunk size for large PDFs (default 20 pages per chunk)
|
| 69 |
+
python pdf_atomic_parser.py parse document.pdf --chunk-size 10
|
| 70 |
+
|
| 71 |
+
# Clear cache for a document
|
| 72 |
+
python pdf_atomic_parser.py clear-cache document.pdf
|
| 73 |
+
|
| 74 |
+
# Show cache stats
|
| 75 |
+
python pdf_atomic_parser.py cache-stats
|
| 76 |
+
|
| 77 |
+
# List all cached documents
|
| 78 |
+
python pdf_atomic_parser.py list-cache
|
| 79 |
+
|
| 80 |
+
# Batch process a directory of PDFs
|
| 81 |
+
python pdf_atomic_parser.py batch ./pdf_folder/ --output ./results/
|
| 82 |
+
|
| 83 |
+
# Export parse results as Markdown report
|
| 84 |
+
python pdf_atomic_parser.py parse document.pdf --format markdown
|
| 85 |
+
|
| 86 |
+
# Export as plain text
|
| 87 |
+
python pdf_atomic_parser.py parse document.pdf --format text
|
| 88 |
+
|
| 89 |
+
# Show token usage estimate before parsing
|
| 90 |
+
python pdf_atomic_parser.py estimate document.pdf
|
| 91 |
+
|
| 92 |
+
# Agent interface example (programmatic)
|
| 93 |
+
# from pdf_atomic_parser import AgentPDFInterface
|
| 94 |
+
# agent = AgentPDFInterface()
|
| 95 |
+
# result = agent.parse("document.pdf")
|
| 96 |
+
# equations = agent.get_equations("document.pdf")
|
| 97 |
+
"""
|
| 98 |
+
|
| 99 |
+
from __future__ import annotations
|
| 100 |
+
|
| 101 |
+
import argparse
|
| 102 |
+
import asyncio
|
| 103 |
+
import base64
|
| 104 |
+
import hashlib
|
| 105 |
+
import json
|
| 106 |
+
import logging
|
| 107 |
+
import os
|
| 108 |
+
import sqlite3
|
| 109 |
+
import sys
|
| 110 |
+
import time
|
| 111 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 112 |
+
from dataclasses import asdict, dataclass, field
|
| 113 |
+
from pathlib import Path
|
| 114 |
+
from typing import Any, Dict, Generator, Iterator, List, Optional, Tuple
|
| 115 |
+
|
| 116 |
+
import anthropic
|
| 117 |
+
import fitz # PyMuPDF
|
| 118 |
+
from rich.console import Console
|
| 119 |
+
from rich.logging import RichHandler
|
| 120 |
+
from rich.progress import (
|
| 121 |
+
BarColumn,
|
| 122 |
+
MofNCompleteColumn,
|
| 123 |
+
Progress,
|
| 124 |
+
SpinnerColumn,
|
| 125 |
+
TaskProgressColumn,
|
| 126 |
+
TextColumn,
|
| 127 |
+
TimeElapsedColumn,
|
| 128 |
+
TimeRemainingColumn,
|
| 129 |
+
)
|
| 130 |
+
from rich.table import Table
|
| 131 |
+
from tqdm import tqdm
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
# Configuration
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
# Model aliases: short names ("opus" / "sonnet" / "haiku") are resolved to
# these IDs by AtomicPDFParser._resolve_model(); any other string is passed
# through unchanged, so full model IDs also work.
DEFAULT_MODEL_OPUS = "claude-opus-4-6"
DEFAULT_MODEL_SONNET = "claude-sonnet-4-6"
DEFAULT_MODEL_HAIKU = "claude-haiku-4-5-20251001"

MAX_TOKENS_OUTPUT = 8192  # max_tokens for each extraction API call
CHUNK_SIZE_DEFAULT = 20  # pages per API call
IMAGE_DPI = 300  # render DPI for page-as-image mode
MAX_PDF_SIZE_BYTES = 32 * 1024 * 1024  # 32 MB native API limit
MAX_PDF_PAGES_NATIVE = 100  # native API page cap per request
CACHE_DB_NAME = ".pdf_parser_cache.db"  # SQLite file name inside the cache dir
LOG_FORMAT = "%(message)s"  # RichHandler renders level/time itself

# Single shared Rich console used for progress bars, tables and log output.
console = Console()

# Root logging defaults to WARNING; AtomicPDFParser(verbose=True) lowers the
# module logger to DEBUG.
logging.basicConfig(
    level=logging.WARNING,
    format=LOG_FORMAT,
    handlers=[RichHandler(console=console, rich_tracebacks=True, show_path=False)],
)
logger = logging.getLogger("pdf_atomic_parser")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ---------------------------------------------------------------------------
|
| 161 |
+
# Data structures
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
|
| 164 |
+
@dataclass
class EquationBlock:
    """A single equation extracted from one PDF page."""

    page: int  # 1-indexed page number the equation appears on
    index: int  # 0-based position of the equation within its page
    latex: str  # LaTeX representation of the equation
    description: str  # model-written explanation of what the equation represents
    inline: bool = False  # True for inline math, False for display/block math
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
@dataclass
class TableBlock:
    """A single table extracted from one PDF page, in two parallel forms."""

    page: int  # 1-indexed page number the table appears on
    index: int  # 0-based position of the table within its page
    markdown: str  # GitHub-flavored Markdown rendering of the table
    json_data: List[Dict]  # same table as a list of row dicts
    caption: str = ""  # table caption, or empty string if none
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
@dataclass
class AlgorithmBlock:
    """An algorithm / code listing extracted from one PDF page."""

    page: int  # 1-indexed page number the algorithm appears on
    index: int  # 0-based position of the algorithm within its page
    name: str  # algorithm name (e.g. "Algorithm 2") as printed in the document
    language: str  # e.g. "pseudocode", "python", "cpp", "generic"
    code: str  # verbatim algorithm text, indentation preserved
    description: str  # model-written summary of what the algorithm does
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
@dataclass
class FigureBlock:
    """A figure/visual extracted from one PDF page, described textually."""

    page: int  # 1-indexed page number the figure appears on
    index: int  # 0-based position of the figure within its page
    figure_type: str  # chart | diagram | drawing | photograph | plot
    description: str  # detailed semantic description of the visual
    data_summary: str  # axes, units, trends, key values for quantitative figures
    caption: str = ""  # figure caption, or empty string if none
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
@dataclass
class PageResult:
    """Structured extraction result for a single PDF page."""

    page_number: int  # 1-indexed page number
    raw_text: str  # full verbatim text of the page
    summary: str  # short factual summary of the page
    equations: List[EquationBlock] = field(default_factory=list)
    tables: List[TableBlock] = field(default_factory=list)
    algorithms: List[AlgorithmBlock] = field(default_factory=list)
    figures: List[FigureBlock] = field(default_factory=list)
    section_headers: List[str] = field(default_factory=list)  # headings seen on the page
    references: List[str] = field(default_factory=list)  # citations / bibliography entries
    keywords: List[str] = field(default_factory=list)  # important technical terms
    layout_notes: str = ""  # columns, footnotes, margin notes, etc.
    processing_mode: str = "native"  # "native" (PDF upload) or "image" (page render)
    tokens_used: int = 0  # API tokens attributed to this page
    processing_time_s: float = 0.0  # wall-clock seconds attributed to this page
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
@dataclass
class DocumentResult:
    """Aggregate result of parsing a whole document (or a page range)."""

    document_path: str  # resolved path of the parsed PDF
    document_hash: str  # short content digest (see ParseCache.file_hash)
    total_pages: int  # page count of the whole PDF
    pages_processed: int  # pages actually parsed (may be < total for a range)
    model: str  # resolved model ID used for extraction
    processing_mode: str  # "native" or "image"
    title: str
    authors: List[str]
    abstract: str
    document_summary: str  # multi-sentence summary produced from sample pages
    page_results: List[PageResult] = field(default_factory=list)
    # Aggregate counters computed over page_results:
    total_equations: int = 0
    total_tables: int = 0
    total_algorithms: int = 0
    total_figures: int = 0
    total_tokens_used: int = 0
    total_processing_time_s: float = 0.0  # wall-clock seconds for the full parse
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ---------------------------------------------------------------------------
|
| 242 |
+
# Cache layer
|
| 243 |
+
# ---------------------------------------------------------------------------
|
| 244 |
+
|
| 245 |
+
class ParseCache:
    """SQLite-backed cache for parsed page results.

    Entries are keyed by (doc_hash, page_num, model, mode), so the same
    document can be cached independently per model and processing mode.

    Bug fix: the previous implementation used ``with self._connect() as conn``
    everywhere — the sqlite3 connection context manager only wraps the
    *transaction* (commit/rollback) and never closes the connection, so every
    call leaked an open database handle.  All methods now close the
    connection in a ``finally`` block while keeping the same commit
    semantics for writes.
    """

    def __init__(self, cache_dir: Path):
        """Create *cache_dir* if needed and initialise the cache schema."""
        cache_dir.mkdir(parents=True, exist_ok=True)
        self.db_path = cache_dir / CACHE_DB_NAME
        self._init_db()

    def _init_db(self) -> None:
        """Create the cache tables if they do not already exist."""
        conn = self._connect()
        try:
            with conn:  # transaction scope: commit on success, rollback on error
                conn.execute("""
                CREATE TABLE IF NOT EXISTS page_cache (
                    doc_hash TEXT NOT NULL,
                    page_num INTEGER NOT NULL,
                    model TEXT NOT NULL,
                    mode TEXT NOT NULL,
                    result_json TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    PRIMARY KEY (doc_hash, page_num, model, mode)
                )
                """)
                conn.execute("""
                CREATE TABLE IF NOT EXISTS doc_meta (
                    doc_hash TEXT PRIMARY KEY,
                    doc_path TEXT NOT NULL,
                    total_pages INTEGER NOT NULL,
                    created_at REAL NOT NULL
                )
                """)
        finally:
            conn.close()

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection; WAL mode allows concurrent readers."""
        conn = sqlite3.connect(self.db_path, timeout=30)
        conn.execute("PRAGMA journal_mode=WAL")
        return conn

    @staticmethod
    def file_hash(path: Path) -> str:
        """Return a short (16 hex chars) SHA-256 digest of the file contents.

        Read in 64 KiB chunks so arbitrarily large PDFs hash in constant
        memory.
        """
        h = hashlib.sha256()
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(65536), b""):
                h.update(chunk)
        return h.hexdigest()[:16]

    def get_page(self, doc_hash: str, page_num: int, model: str, mode: str) -> Optional[PageResult]:
        """Return the cached PageResult for this key, or None on a cache miss."""
        conn = self._connect()
        try:
            row = conn.execute(
                "SELECT result_json FROM page_cache WHERE doc_hash=? AND page_num=? AND model=? AND mode=?",
                (doc_hash, page_num, model, mode),
            ).fetchone()
        finally:
            conn.close()
        if row:
            return self._deserialize_page(json.loads(row[0]))
        return None

    def set_page(self, doc_hash: str, result: PageResult, model: str, mode: str) -> None:
        """Insert or overwrite the cached result for one page."""
        conn = self._connect()
        try:
            with conn:  # commit the write
                conn.execute(
                    "INSERT OR REPLACE INTO page_cache VALUES (?,?,?,?,?,?)",
                    (doc_hash, result.page_number, model, mode,
                     json.dumps(self._serialize_page(result)), time.time()),
                )
        finally:
            conn.close()

    def clear_document(self, doc_hash: str) -> int:
        """Delete all cached pages (and metadata) for one document.

        Returns the number of page rows removed.
        """
        conn = self._connect()
        try:
            with conn:  # commit both deletes together
                cur = conn.execute("DELETE FROM page_cache WHERE doc_hash=?", (doc_hash,))
                conn.execute("DELETE FROM doc_meta WHERE doc_hash=?", (doc_hash,))
                return cur.rowcount
        finally:
            conn.close()

    def stats(self) -> Dict[str, Any]:
        """Return cache-wide counters and the on-disk DB size in MB."""
        conn = self._connect()
        try:
            total = conn.execute("SELECT COUNT(*) FROM page_cache").fetchone()[0]
            docs = conn.execute("SELECT COUNT(DISTINCT doc_hash) FROM page_cache").fetchone()[0]
        finally:
            conn.close()
        size = self.db_path.stat().st_size if self.db_path.exists() else 0
        return {"total_cached_pages": total, "unique_documents": docs, "cache_size_mb": round(size / 1e6, 2)}

    def list_documents(self) -> List[Dict]:
        """Return one summary dict per cached document."""
        conn = self._connect()
        try:
            rows = conn.execute("""
                SELECT doc_hash, COUNT(*) as pages, MIN(created_at) as first_seen
                FROM page_cache GROUP BY doc_hash
            """).fetchall()
        finally:
            conn.close()
        return [{"hash": r[0], "cached_pages": r[1], "first_seen": r[2]} for r in rows]

    # -- serialization helpers -----------------------------------------------

    @staticmethod
    def _serialize_page(p: PageResult) -> Dict:
        """Convert a PageResult (with nested dataclasses) to a plain dict."""
        return asdict(p)

    @staticmethod
    def _deserialize_page(d: Dict) -> PageResult:
        """Rebuild a PageResult from a dict produced by _serialize_page."""
        d["equations"] = [EquationBlock(**e) for e in d.get("equations", [])]
        d["tables"] = [TableBlock(**t) for t in d.get("tables", [])]
        d["algorithms"] = [AlgorithmBlock(**a) for a in d.get("algorithms", [])]
        d["figures"] = [FigureBlock(**f) for f in d.get("figures", [])]
        return PageResult(**d)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
# ---------------------------------------------------------------------------
|
| 344 |
+
# PDF utilities
|
| 345 |
+
# ---------------------------------------------------------------------------
|
| 346 |
+
|
| 347 |
+
class PDFDocument:
    """Thin wrapper around fitz.Document with chunking helpers.

    Usable as a context manager; the underlying document is closed on exit.
    """

    def __init__(self, path: Path):
        self.path = path
        self._doc = fitz.open(str(path))
        self.total_pages = len(self._doc)

    @property
    def file_size_bytes(self) -> int:
        """Size of the PDF file on disk, in bytes."""
        return self.path.stat().st_size

    def get_chunk_ranges(self, chunk_size: int) -> List[Tuple[int, int]]:
        """Return list of (start_page_0indexed, end_page_exclusive) tuples."""
        page_count = self.total_pages
        return [
            (first, min(first + chunk_size, page_count))
            for first in range(0, page_count, chunk_size)
        ]

    def get_chunk_as_pdf_bytes(self, start: int, end: int) -> bytes:
        """Extract pages [start, end) into a new in-memory PDF."""
        extracted = fitz.open()
        # fitz uses an inclusive to_page, hence end - 1.
        extracted.insert_pdf(self._doc, from_page=start, to_page=end - 1)
        return extracted.write()

    def get_page_as_png_bytes(self, page_idx: int, dpi: int = IMAGE_DPI) -> bytes:
        """Render a single page to PNG bytes at the given DPI."""
        scale = dpi / 72  # fitz renders at 72 DPI for the identity matrix
        pixmap = self._doc[page_idx].get_pixmap(
            matrix=fitz.Matrix(scale, scale), alpha=False
        )
        return pixmap.tobytes("png")

    def close(self) -> None:
        self._doc.close()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
# ---------------------------------------------------------------------------
|
| 391 |
+
# Extraction prompts
|
| 392 |
+
# ---------------------------------------------------------------------------
|
| 393 |
+
|
| 394 |
+
# System prompt shared by all extraction calls; forces JSON-only responses.
SYSTEM_PROMPT = """You are an expert scientific document analyst specializing in atomically
parsing complex academic and technical PDFs. Your extractions must be:
- Complete: capture every equation, table, figure, and algorithm
- Faithful: never invent or hallucinate content
- Precise: reproduce equations in proper LaTeX
- Structured: respond only with valid JSON matching the schema provided

Do NOT add prose outside the JSON response. If a field has no content, use an
empty list [] or empty string "" rather than null."""

# Per-chunk extraction prompt; the JSON schema here mirrors PageResult and
# its nested dataclasses (EquationBlock, TableBlock, AlgorithmBlock,
# FigureBlock).
PAGE_EXTRACTION_PROMPT = """\
Atomically parse the provided PDF page(s) and return a JSON object that matches
this schema exactly:

{
  "raw_text": "<full verbatim text extracted from page, preserving paragraphs>",
  "summary": "<2-4 sentence factual summary of this page>",
  "section_headers": ["<header string>", ...],
  "keywords": ["<important technical term>", ...],
  "layout_notes": "<describe columns, special layouts, footnotes, margin notes>",
  "equations": [
    {
      "index": <int starting at 0>,
      "latex": "<complete LaTeX representation>",
      "description": "<what this equation represents>",
      "inline": <true if inline, false if display/block>
    }
  ],
  "tables": [
    {
      "index": <int>,
      "markdown": "<GitHub-flavored Markdown table>",
      "json_data": [{"col1": "val", ...}, ...],
      "caption": "<table caption or empty string>"
    }
  ],
  "algorithms": [
    {
      "index": <int>,
      "name": "<algorithm name or Algorithm N>",
      "language": "<pseudocode | python | cpp | generic | etc.>",
      "code": "<verbatim algorithm text, preserve indentation>",
      "description": "<what this algorithm does>"
    }
  ],
  "figures": [
    {
      "index": <int>,
      "figure_type": "<chart | bar_chart | line_chart | scatter_plot | histogram | diagram | flowchart | neural_network | tree | graph | drawing | photograph | heatmap | 3d_plot | other>",
      "description": "<detailed semantic description of the visual>",
      "data_summary": "<describe axes, units, trend, key values if quantitative>",
      "caption": "<figure caption or empty string>"
    }
  ],
  "references": ["<any in-text citation or bibliography entry on this page>"]
}

Rules:
1. Every equation MUST have LaTeX. Use \\frac, \\sum, \\int, \\mathbf etc. for proper notation.
2. Tables must be fully reproduced in both Markdown and as list-of-dicts.
3. Algorithms must preserve all steps, loops, conditions verbatim.
4. Figures: describe them as if for a blind reader — quantitative values, trends, colors, labels.
5. raw_text must include ALL text visible on the page, including headers, footers, captions.
6. Do NOT summarize or truncate any content.
"""

# Follow-up prompt used once per document to extract title/authors/abstract;
# the keys mirror the corresponding DocumentResult fields.
DOCUMENT_META_PROMPT = """\
Based on the document pages you have seen, extract high-level metadata as JSON:

{
  "title": "<document title>",
  "authors": ["<author name>", ...],
  "abstract": "<full abstract text or empty string if none>",
  "document_summary": "<comprehensive 5-8 sentence summary of the entire document>"
}

Respond with valid JSON only.
"""
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
# ---------------------------------------------------------------------------
|
| 475 |
+
# Core parser
|
| 476 |
+
# ---------------------------------------------------------------------------
|
| 477 |
+
|
| 478 |
+
class AtomicPDFParser:
|
| 479 |
+
"""
|
| 480 |
+
Core parser that sends PDF chunks or page images to the Claude API
|
| 481 |
+
and extracts structured content atomically.
|
| 482 |
+
"""
|
| 483 |
+
|
| 484 |
+
def __init__(
|
| 485 |
+
self,
|
| 486 |
+
api_key: Optional[str] = None,
|
| 487 |
+
model: str = DEFAULT_MODEL_OPUS,
|
| 488 |
+
mode: str = "native", # "native" | "image"
|
| 489 |
+
chunk_size: int = CHUNK_SIZE_DEFAULT,
|
| 490 |
+
cache_dir: Optional[Path] = None,
|
| 491 |
+
verbose: bool = False,
|
| 492 |
+
max_workers: int = 4,
|
| 493 |
+
):
|
| 494 |
+
self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "")
|
| 495 |
+
self.model = self._resolve_model(model)
|
| 496 |
+
self.mode = mode
|
| 497 |
+
self.chunk_size = chunk_size
|
| 498 |
+
self.verbose = verbose
|
| 499 |
+
self.max_workers = max_workers
|
| 500 |
+
|
| 501 |
+
if not self.api_key:
|
| 502 |
+
raise ValueError(
|
| 503 |
+
"ANTHROPIC_API_KEY environment variable not set. "
|
| 504 |
+
"Export it or pass api_key= to AtomicPDFParser."
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
+
self.client = anthropic.Anthropic(api_key=self.api_key)
|
| 508 |
+
|
| 509 |
+
cache_path = cache_dir or Path.home() / ".cache" / "pdf_atomic_parser"
|
| 510 |
+
self.cache = ParseCache(cache_path)
|
| 511 |
+
|
| 512 |
+
if verbose:
|
| 513 |
+
logger.setLevel(logging.DEBUG)
|
| 514 |
+
|
| 515 |
+
@staticmethod
|
| 516 |
+
def _resolve_model(alias: str) -> str:
|
| 517 |
+
mapping = {
|
| 518 |
+
"opus": DEFAULT_MODEL_OPUS,
|
| 519 |
+
"sonnet": DEFAULT_MODEL_SONNET,
|
| 520 |
+
"haiku": DEFAULT_MODEL_HAIKU,
|
| 521 |
+
}
|
| 522 |
+
return mapping.get(alias.lower(), alias)
|
| 523 |
+
|
| 524 |
+
# ------------------------------------------------------------------
|
| 525 |
+
# Public API
|
| 526 |
+
# ------------------------------------------------------------------
|
| 527 |
+
|
| 528 |
+
def parse(
|
| 529 |
+
self,
|
| 530 |
+
pdf_path: str | Path,
|
| 531 |
+
page_range: Optional[Tuple[int, int]] = None,
|
| 532 |
+
) -> DocumentResult:
|
| 533 |
+
"""
|
| 534 |
+
Parse the entire document (or a page range) atomically.
|
| 535 |
+
|
| 536 |
+
Parameters
|
| 537 |
+
----------
|
| 538 |
+
pdf_path : Path to the PDF file.
|
| 539 |
+
page_range : Optional (start, end) 1-indexed inclusive page numbers.
|
| 540 |
+
|
| 541 |
+
Returns
|
| 542 |
+
-------
|
| 543 |
+
DocumentResult with full structured extraction.
|
| 544 |
+
"""
|
| 545 |
+
path = Path(pdf_path).resolve()
|
| 546 |
+
if not path.exists():
|
| 547 |
+
raise FileNotFoundError(f"PDF not found: {path}")
|
| 548 |
+
|
| 549 |
+
doc_hash = self.cache.file_hash(path)
|
| 550 |
+
t_start = time.time()
|
| 551 |
+
|
| 552 |
+
with PDFDocument(path) as pdf:
|
| 553 |
+
total = pdf.total_pages
|
| 554 |
+
if page_range:
|
| 555 |
+
p_start = max(0, page_range[0] - 1)
|
| 556 |
+
p_end = min(total, page_range[1])
|
| 557 |
+
else:
|
| 558 |
+
p_start, p_end = 0, total
|
| 559 |
+
|
| 560 |
+
chunks = []
|
| 561 |
+
for s in range(p_start, p_end, self.chunk_size):
|
| 562 |
+
e = min(s + self.chunk_size, p_end)
|
| 563 |
+
chunks.append((s, e))
|
| 564 |
+
|
| 565 |
+
page_results: List[PageResult] = []
|
| 566 |
+
|
| 567 |
+
with Progress(
|
| 568 |
+
SpinnerColumn(),
|
| 569 |
+
TextColumn("[bold cyan]{task.description}"),
|
| 570 |
+
BarColumn(),
|
| 571 |
+
MofNCompleteColumn(),
|
| 572 |
+
TaskProgressColumn(),
|
| 573 |
+
TimeElapsedColumn(),
|
| 574 |
+
TimeRemainingColumn(),
|
| 575 |
+
console=console,
|
| 576 |
+
transient=False,
|
| 577 |
+
) as progress:
|
| 578 |
+
task = progress.add_task(
|
| 579 |
+
f"[cyan]Parsing {path.name}", total=len(chunks)
|
| 580 |
+
)
|
| 581 |
+
|
| 582 |
+
for chunk_start, chunk_end in chunks:
|
| 583 |
+
chunk_pages = self._parse_chunk(
|
| 584 |
+
pdf, doc_hash, chunk_start, chunk_end
|
| 585 |
+
)
|
| 586 |
+
page_results.extend(chunk_pages)
|
| 587 |
+
progress.advance(task)
|
| 588 |
+
|
| 589 |
+
# Build document-level metadata
|
| 590 |
+
meta = self._extract_document_meta(page_results)
|
| 591 |
+
|
| 592 |
+
doc_result = DocumentResult(
|
| 593 |
+
document_path = str(path),
|
| 594 |
+
document_hash = doc_hash,
|
| 595 |
+
total_pages = total,
|
| 596 |
+
pages_processed = len(page_results),
|
| 597 |
+
model = self.model,
|
| 598 |
+
processing_mode = self.mode,
|
| 599 |
+
title = meta.get("title", ""),
|
| 600 |
+
authors = meta.get("authors", []),
|
| 601 |
+
abstract = meta.get("abstract", ""),
|
| 602 |
+
document_summary = meta.get("document_summary", ""),
|
| 603 |
+
page_results = page_results,
|
| 604 |
+
total_equations = sum(len(p.equations) for p in page_results),
|
| 605 |
+
total_tables = sum(len(p.tables) for p in page_results),
|
| 606 |
+
total_algorithms = sum(len(p.algorithms) for p in page_results),
|
| 607 |
+
total_figures = sum(len(p.figures) for p in page_results),
|
| 608 |
+
total_tokens_used = sum(p.tokens_used for p in page_results),
|
| 609 |
+
total_processing_time_s = time.time() - t_start,
|
| 610 |
+
)
|
| 611 |
+
return doc_result
|
| 612 |
+
|
| 613 |
+
def extract_equations(self, pdf_path: str | Path) -> List[EquationBlock]:
|
| 614 |
+
result = self.parse(pdf_path)
|
| 615 |
+
return [eq for p in result.page_results for eq in p.equations]
|
| 616 |
+
|
| 617 |
+
def extract_tables(self, pdf_path: str | Path) -> List[TableBlock]:
|
| 618 |
+
result = self.parse(pdf_path)
|
| 619 |
+
return [tb for p in result.page_results for tb in p.tables]
|
| 620 |
+
|
| 621 |
+
def extract_algorithms(self, pdf_path: str | Path) -> List[AlgorithmBlock]:
|
| 622 |
+
result = self.parse(pdf_path)
|
| 623 |
+
return [al for p in result.page_results for al in p.algorithms]
|
| 624 |
+
|
| 625 |
+
def extract_figures(self, pdf_path: str | Path) -> List[FigureBlock]:
|
| 626 |
+
result = self.parse(pdf_path)
|
| 627 |
+
return [fg for p in result.page_results for fg in p.figures]
|
| 628 |
+
|
| 629 |
+
def query(self, pdf_path: str | Path, question: str) -> str:
|
| 630 |
+
"""
|
| 631 |
+
Semantic query over cached parse results. Re-parses if not cached.
|
| 632 |
+
"""
|
| 633 |
+
result = self.parse(pdf_path)
|
| 634 |
+
full_text = "\n\n".join(
|
| 635 |
+
f"[Page {p.page_number}]\n{p.raw_text}" for p in result.page_results
|
| 636 |
+
)
|
| 637 |
+
messages = [
|
| 638 |
+
{
|
| 639 |
+
"role": "user",
|
| 640 |
+
"content": (
|
| 641 |
+
f"Based on the following document content, answer this question "
|
| 642 |
+
f"precisely and cite page numbers where relevant.\n\n"
|
| 643 |
+
f"Question: {question}\n\n"
|
| 644 |
+
f"Document content:\n{full_text[:60000]}"
|
| 645 |
+
),
|
| 646 |
+
}
|
| 647 |
+
]
|
| 648 |
+
resp = self.client.messages.create(
|
| 649 |
+
model=self.model,
|
| 650 |
+
max_tokens=2048,
|
| 651 |
+
messages=messages,
|
| 652 |
+
)
|
| 653 |
+
return resp.content[0].text
|
| 654 |
+
|
| 655 |
+
# ------------------------------------------------------------------
|
| 656 |
+
# Internal methods
|
| 657 |
+
# ------------------------------------------------------------------
|
| 658 |
+
|
| 659 |
+
def _parse_chunk(
|
| 660 |
+
self,
|
| 661 |
+
pdf: PDFDocument,
|
| 662 |
+
doc_hash: str,
|
| 663 |
+
chunk_start: int,
|
| 664 |
+
chunk_end: int,
|
| 665 |
+
) -> List[PageResult]:
|
| 666 |
+
"""Parse a range of pages, using cache when available."""
|
| 667 |
+
results = []
|
| 668 |
+
pages_to_process = []
|
| 669 |
+
|
| 670 |
+
for pg in range(chunk_start, chunk_end):
|
| 671 |
+
cached = self.cache.get_page(doc_hash, pg + 1, self.model, self.mode)
|
| 672 |
+
if cached:
|
| 673 |
+
logger.debug("Cache hit page %d", pg + 1)
|
| 674 |
+
results.append(cached)
|
| 675 |
+
else:
|
| 676 |
+
pages_to_process.append(pg)
|
| 677 |
+
|
| 678 |
+
if not pages_to_process:
|
| 679 |
+
return results
|
| 680 |
+
|
| 681 |
+
# Group consecutive un-cached pages into sub-chunks
|
| 682 |
+
sub_chunks = self._group_consecutive(pages_to_process)
|
| 683 |
+
for sub_start, sub_end in sub_chunks:
|
| 684 |
+
sub_results = self._call_api_chunk(pdf, doc_hash, sub_start, sub_end)
|
| 685 |
+
results.extend(sub_results)
|
| 686 |
+
|
| 687 |
+
results.sort(key=lambda r: r.page_number)
|
| 688 |
+
return results
|
| 689 |
+
|
| 690 |
+
@staticmethod
|
| 691 |
+
def _group_consecutive(pages: List[int]) -> List[Tuple[int, int]]:
|
| 692 |
+
if not pages:
|
| 693 |
+
return []
|
| 694 |
+
groups, start, prev = [], pages[0], pages[0]
|
| 695 |
+
for p in pages[1:]:
|
| 696 |
+
if p != prev + 1:
|
| 697 |
+
groups.append((start, prev + 1))
|
| 698 |
+
start = p
|
| 699 |
+
prev = p
|
| 700 |
+
groups.append((start, prev + 1))
|
| 701 |
+
return groups
|
| 702 |
+
|
| 703 |
+
def _call_api_chunk(
|
| 704 |
+
self,
|
| 705 |
+
pdf: PDFDocument,
|
| 706 |
+
doc_hash: str,
|
| 707 |
+
chunk_start: int,
|
| 708 |
+
chunk_end: int,
|
| 709 |
+
) -> List[PageResult]:
|
| 710 |
+
"""Send pages to Claude API and parse response."""
|
| 711 |
+
t_start = time.time()
|
| 712 |
+
|
| 713 |
+
if self.mode == "image":
|
| 714 |
+
return self._call_api_as_images(pdf, doc_hash, chunk_start, chunk_end, t_start)
|
| 715 |
+
else:
|
| 716 |
+
return self._call_api_native(pdf, doc_hash, chunk_start, chunk_end, t_start)
|
| 717 |
+
|
| 718 |
+
def _call_api_native(
|
| 719 |
+
self,
|
| 720 |
+
pdf: PDFDocument,
|
| 721 |
+
doc_hash: str,
|
| 722 |
+
chunk_start: int,
|
| 723 |
+
chunk_end: int,
|
| 724 |
+
t_start: float,
|
| 725 |
+
) -> List[PageResult]:
|
| 726 |
+
chunk_bytes = pdf.get_chunk_as_pdf_bytes(chunk_start, chunk_end)
|
| 727 |
+
b64_pdf = base64.standard_b64encode(chunk_bytes).decode("utf-8")
|
| 728 |
+
num_pages = chunk_end - chunk_start
|
| 729 |
+
|
| 730 |
+
prompt_suffix = (
|
| 731 |
+
f"\nThis PDF chunk contains pages {chunk_start + 1} to {chunk_end} "
|
| 732 |
+
f"of the original document. "
|
| 733 |
+
f"Return a JSON array with exactly {num_pages} page objects matching the schema. "
|
| 734 |
+
f"Index them page_number={chunk_start + 1} through {chunk_end}."
|
| 735 |
+
)
|
| 736 |
+
|
| 737 |
+
messages = [
|
| 738 |
+
{
|
| 739 |
+
"role": "user",
|
| 740 |
+
"content": [
|
| 741 |
+
{
|
| 742 |
+
"type": "document",
|
| 743 |
+
"source": {
|
| 744 |
+
"type": "base64",
|
| 745 |
+
"media_type": "application/pdf",
|
| 746 |
+
"data": b64_pdf,
|
| 747 |
+
},
|
| 748 |
+
"cache_control": {"type": "ephemeral"},
|
| 749 |
+
},
|
| 750 |
+
{
|
| 751 |
+
"type": "text",
|
| 752 |
+
"text": PAGE_EXTRACTION_PROMPT + prompt_suffix,
|
| 753 |
+
},
|
| 754 |
+
],
|
| 755 |
+
}
|
| 756 |
+
]
|
| 757 |
+
|
| 758 |
+
return self._execute_api_call(messages, doc_hash, chunk_start, chunk_end, t_start, "native")
|
| 759 |
+
|
| 760 |
+
def _call_api_as_images(
|
| 761 |
+
self,
|
| 762 |
+
pdf: PDFDocument,
|
| 763 |
+
doc_hash: str,
|
| 764 |
+
chunk_start: int,
|
| 765 |
+
chunk_end: int,
|
| 766 |
+
t_start: float,
|
| 767 |
+
) -> List[PageResult]:
|
| 768 |
+
content = []
|
| 769 |
+
for pg_idx in range(chunk_start, chunk_end):
|
| 770 |
+
png_bytes = pdf.get_page_as_png_bytes(pg_idx, dpi=IMAGE_DPI)
|
| 771 |
+
b64_img = base64.standard_b64encode(png_bytes).decode("utf-8")
|
| 772 |
+
content.append({
|
| 773 |
+
"type": "text",
|
| 774 |
+
"text": f"--- Page {pg_idx + 1} ---",
|
| 775 |
+
})
|
| 776 |
+
content.append({
|
| 777 |
+
"type": "image",
|
| 778 |
+
"source": {
|
| 779 |
+
"type": "base64",
|
| 780 |
+
"media_type": "image/png",
|
| 781 |
+
"data": b64_img,
|
| 782 |
+
},
|
| 783 |
+
})
|
| 784 |
+
|
| 785 |
+
num_pages = chunk_end - chunk_start
|
| 786 |
+
prompt_suffix = (
|
| 787 |
+
f"\nThese are page images {chunk_start + 1} through {chunk_end}. "
|
| 788 |
+
f"Return a JSON array with exactly {num_pages} page objects matching the schema. "
|
| 789 |
+
f"Index them page_number={chunk_start + 1} through {chunk_end}."
|
| 790 |
+
)
|
| 791 |
+
content.append({"type": "text", "text": PAGE_EXTRACTION_PROMPT + prompt_suffix})
|
| 792 |
+
|
| 793 |
+
messages = [{"role": "user", "content": content}]
|
| 794 |
+
return self._execute_api_call(messages, doc_hash, chunk_start, chunk_end, t_start, "image")
|
| 795 |
+
|
| 796 |
+
def _execute_api_call(
|
| 797 |
+
self,
|
| 798 |
+
messages: List[Dict],
|
| 799 |
+
doc_hash: str,
|
| 800 |
+
chunk_start: int,
|
| 801 |
+
chunk_end: int,
|
| 802 |
+
t_start: float,
|
| 803 |
+
mode: str,
|
| 804 |
+
) -> List[PageResult]:
|
| 805 |
+
retries, delay = 3, 5
|
| 806 |
+
for attempt in range(retries):
|
| 807 |
+
try:
|
| 808 |
+
resp = self.client.messages.create(
|
| 809 |
+
model=self.model,
|
| 810 |
+
max_tokens=MAX_TOKENS_OUTPUT,
|
| 811 |
+
system=SYSTEM_PROMPT,
|
| 812 |
+
messages=messages,
|
| 813 |
+
)
|
| 814 |
+
break
|
| 815 |
+
except anthropic.RateLimitError:
|
| 816 |
+
if attempt == retries - 1:
|
| 817 |
+
raise
|
| 818 |
+
logger.warning("Rate limit hit; retrying in %ds...", delay)
|
| 819 |
+
time.sleep(delay)
|
| 820 |
+
delay *= 2
|
| 821 |
+
except anthropic.APIStatusError as exc:
|
| 822 |
+
logger.error("API error: %s", exc)
|
| 823 |
+
raise
|
| 824 |
+
|
| 825 |
+
raw_response = resp.content[0].text.strip()
|
| 826 |
+
tokens_used = resp.usage.input_tokens + resp.usage.output_tokens
|
| 827 |
+
elapsed = time.time() - t_start
|
| 828 |
+
|
| 829 |
+
# Clean possible markdown fences
|
| 830 |
+
if raw_response.startswith("```"):
|
| 831 |
+
lines = raw_response.split("\n")
|
| 832 |
+
raw_response = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])
|
| 833 |
+
|
| 834 |
+
try:
|
| 835 |
+
parsed = json.loads(raw_response)
|
| 836 |
+
except json.JSONDecodeError as exc:
|
| 837 |
+
logger.error("JSON parse error on API response: %s\nRaw:\n%s", exc, raw_response[:500])
|
| 838 |
+
# Return minimal fallback for affected pages
|
| 839 |
+
return [
|
| 840 |
+
PageResult(
|
| 841 |
+
page_number=pg + 1,
|
| 842 |
+
raw_text="[PARSE ERROR: JSON decode failed]",
|
| 843 |
+
summary="Failed to parse this page.",
|
| 844 |
+
processing_mode=mode,
|
| 845 |
+
tokens_used=tokens_used // max(1, chunk_end - chunk_start),
|
| 846 |
+
processing_time_s=elapsed,
|
| 847 |
+
)
|
| 848 |
+
for pg in range(chunk_start, chunk_end)
|
| 849 |
+
]
|
| 850 |
+
|
| 851 |
+
# Handle both array-of-pages and single-page responses
|
| 852 |
+
if isinstance(parsed, dict):
|
| 853 |
+
parsed = [parsed]
|
| 854 |
+
|
| 855 |
+
results = []
|
| 856 |
+
for i, page_data in enumerate(parsed):
|
| 857 |
+
pg_num = chunk_start + i + 1
|
| 858 |
+
page_data["page_number"] = pg_num
|
| 859 |
+
page_data["processing_mode"] = mode
|
| 860 |
+
page_data["tokens_used"] = tokens_used // len(parsed)
|
| 861 |
+
page_data["processing_time_s"] = elapsed / len(parsed)
|
| 862 |
+
|
| 863 |
+
pr = self._dict_to_page_result(page_data)
|
| 864 |
+
self.cache.set_page(doc_hash, pr, self.model, mode)
|
| 865 |
+
results.append(pr)
|
| 866 |
+
|
| 867 |
+
return results
|
| 868 |
+
|
| 869 |
+
@staticmethod
|
| 870 |
+
def _dict_to_page_result(d: Dict) -> PageResult:
|
| 871 |
+
equations = [
|
| 872 |
+
EquationBlock(
|
| 873 |
+
page=d["page_number"],
|
| 874 |
+
index=e.get("index", i),
|
| 875 |
+
latex=e.get("latex", ""),
|
| 876 |
+
description=e.get("description", ""),
|
| 877 |
+
inline=e.get("inline", False),
|
| 878 |
+
)
|
| 879 |
+
for i, e in enumerate(d.get("equations", []))
|
| 880 |
+
]
|
| 881 |
+
tables = [
|
| 882 |
+
TableBlock(
|
| 883 |
+
page=d["page_number"],
|
| 884 |
+
index=t.get("index", i),
|
| 885 |
+
markdown=t.get("markdown", ""),
|
| 886 |
+
json_data=t.get("json_data", []),
|
| 887 |
+
caption=t.get("caption", ""),
|
| 888 |
+
)
|
| 889 |
+
for i, t in enumerate(d.get("tables", []))
|
| 890 |
+
]
|
| 891 |
+
algorithms = [
|
| 892 |
+
AlgorithmBlock(
|
| 893 |
+
page=d["page_number"],
|
| 894 |
+
index=a.get("index", i),
|
| 895 |
+
name=a.get("name", f"Algorithm {i+1}"),
|
| 896 |
+
language=a.get("language", "pseudocode"),
|
| 897 |
+
code=a.get("code", ""),
|
| 898 |
+
description=a.get("description", ""),
|
| 899 |
+
)
|
| 900 |
+
for i, a in enumerate(d.get("algorithms", []))
|
| 901 |
+
]
|
| 902 |
+
figures = [
|
| 903 |
+
FigureBlock(
|
| 904 |
+
page=d["page_number"],
|
| 905 |
+
index=f.get("index", i),
|
| 906 |
+
figure_type=f.get("figure_type", "other"),
|
| 907 |
+
description=f.get("description", ""),
|
| 908 |
+
data_summary=f.get("data_summary", ""),
|
| 909 |
+
caption=f.get("caption", ""),
|
| 910 |
+
)
|
| 911 |
+
for i, f in enumerate(d.get("figures", []))
|
| 912 |
+
]
|
| 913 |
+
return PageResult(
|
| 914 |
+
page_number = d["page_number"],
|
| 915 |
+
raw_text = d.get("raw_text", ""),
|
| 916 |
+
summary = d.get("summary", ""),
|
| 917 |
+
equations = equations,
|
| 918 |
+
tables = tables,
|
| 919 |
+
algorithms = algorithms,
|
| 920 |
+
figures = figures,
|
| 921 |
+
section_headers = d.get("section_headers", []),
|
| 922 |
+
references = d.get("references", []),
|
| 923 |
+
keywords = d.get("keywords", []),
|
| 924 |
+
layout_notes = d.get("layout_notes", ""),
|
| 925 |
+
processing_mode = d.get("processing_mode", "native"),
|
| 926 |
+
tokens_used = d.get("tokens_used", 0),
|
| 927 |
+
processing_time_s = d.get("processing_time_s", 0.0),
|
| 928 |
+
)
|
| 929 |
+
|
| 930 |
+
def _extract_document_meta(self, page_results: List[PageResult]) -> Dict:
    """Extract document-level metadata (title, authors, abstract, summary).

    Sends the raw text of the first five parsed pages to the model and
    decodes its JSON reply.  Best-effort: on any failure a dict with empty
    metadata fields is returned so callers can rely on the keys existing.

    Args:
        page_results: Per-page parse results; only the first five are used.

    Returns:
        Parsed metadata dict, or an all-empty fallback on error.
    """
    # Title/author/abstract information almost always sits on the opening
    # pages, so a five-page sample keeps the request small.
    front_pages = page_results[:5]
    sample_text = "\n\n".join(
        f"[Page {p.page_number}]\n{p.raw_text}" for p in front_pages
    )
    # Cap the sample to bound token usage on very dense front matter.
    prompt = f"{DOCUMENT_META_PROMPT}\n\nDocument sample:\n{sample_text[:8000]}"
    try:
        resp = self.client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=SYSTEM_PROMPT,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = resp.content[0].text.strip()
        # The model sometimes wraps its JSON in a Markdown code fence;
        # strip the fence lines before decoding.
        if raw.startswith("```"):
            fence_lines = raw.split("\n")
            if fence_lines[-1].strip() == "```":
                fence_lines = fence_lines[1:-1]
            else:
                fence_lines = fence_lines[1:]
            raw = "\n".join(fence_lines)
        return json.loads(raw)
    except Exception as exc:  # metadata is optional — degrade gracefully
        logger.warning("Document meta extraction failed: %s", exc)
        return {"title": "", "authors": [], "abstract": "", "document_summary": ""}
|
| 958 |
+
|
| 959 |
+
|
| 960 |
+
# ---------------------------------------------------------------------------
|
| 961 |
+
# Output formatters
|
| 962 |
+
# ---------------------------------------------------------------------------
|
| 963 |
+
|
| 964 |
+
class OutputFormatter:
    """Render a DocumentResult as JSON, Markdown, plain text, or a console table.

    All methods are static and side-effect free except print_summary_table,
    which writes to the module-level rich console.
    """

    @staticmethod
    def to_json(result: DocumentResult, indent: int = 2) -> str:
        """Serialize the full result (including nested page dataclasses) to JSON."""
        # asdict() recurses into nested dataclasses, so pages/equations/etc.
        # become plain dicts and lists.
        return json.dumps(asdict(result), indent=indent, ensure_ascii=False)

    @staticmethod
    def to_markdown(result: DocumentResult) -> str:
        """Build a human-readable Markdown report: header stats, then one section per page."""
        lines = []
        # Fall back to the file name when the model found no title.
        lines.append(f"# {result.title or Path(result.document_path).name}")
        if result.authors:
            lines.append(f"\n**Authors:** {', '.join(result.authors)}")
        lines.append(f"\n**Document Hash:** `{result.document_hash}`")
        lines.append(f"**Model:** {result.model} | **Mode:** {result.processing_mode}")
        lines.append(
            f"**Pages:** {result.pages_processed}/{result.total_pages} | "
            f"**Tokens:** {result.total_tokens_used:,} | "
            f"**Time:** {result.total_processing_time_s:.1f}s"
        )
        lines.append(
            f"**Equations:** {result.total_equations} | "
            f"**Tables:** {result.total_tables} | "
            f"**Algorithms:** {result.total_algorithms} | "
            f"**Figures:** {result.total_figures}"
        )
        if result.abstract:
            lines.append(f"\n## Abstract\n\n{result.abstract}")
        if result.document_summary:
            lines.append(f"\n## Document Summary\n\n{result.document_summary}")

        # One "## Page N" section per parsed page, each with its extracted
        # artifacts (equations, tables, algorithms, figures) as subsections.
        for page in result.page_results:
            lines.append(f"\n---\n\n## Page {page.page_number}")
            if page.section_headers:
                lines.append("\n### Sections\n" + "\n".join(f"- {h}" for h in page.section_headers))
            lines.append(f"\n### Summary\n{page.summary}")
            lines.append(f"\n### Full Text\n\n{page.raw_text}")

            if page.equations:
                lines.append("\n### Equations\n")
                for eq in page.equations:
                    lines.append(f"**Eq {eq.index}** ({('inline' if eq.inline else 'display')})")
                    lines.append(f"```latex\n{eq.latex}\n```")
                    lines.append(f"*{eq.description}*\n")

            if page.tables:
                lines.append("\n### Tables\n")
                for tb in page.tables:
                    if tb.caption:
                        lines.append(f"**{tb.caption}**\n")
                    lines.append(tb.markdown + "\n")

            if page.algorithms:
                lines.append("\n### Algorithms\n")
                for al in page.algorithms:
                    lines.append(f"**{al.name}** ({al.language})\n")
                    # Use the detected language as the fence tag for highlighting.
                    lines.append(f"```{al.language}\n{al.code}\n```")
                    lines.append(f"*{al.description}*\n")

            if page.figures:
                lines.append("\n### Figures\n")
                for fg in page.figures:
                    lines.append(f"**Figure {fg.index}** [{fg.figure_type}]")
                    if fg.caption:
                        lines.append(f"*{fg.caption}*")
                    lines.append(fg.description)
                    if fg.data_summary:
                        lines.append(f"Data: {fg.data_summary}\n")

        return "\n".join(lines)

    @staticmethod
    def to_text(result: DocumentResult) -> str:
        """Render a minimal plain-text dump: header, summary, then raw page text."""
        lines = [
            f"DOCUMENT: {result.title or Path(result.document_path).name}",
            f"Authors: {', '.join(result.authors)}",
            f"Pages processed: {result.pages_processed}/{result.total_pages}",
            "",
            "SUMMARY",
            "=" * 60,
            result.document_summary,
            "",
        ]
        for page in result.page_results:
            lines.append(f"\n[PAGE {page.page_number}]")
            lines.append(page.raw_text)
        return "\n".join(lines)

    @staticmethod
    def print_summary_table(result: DocumentResult) -> None:
        """Print a rich summary table of parse metrics to the console."""
        table = Table(title=f"Parse Results: {Path(result.document_path).name}", show_lines=True)
        table.add_column("Metric", style="cyan", no_wrap=True)
        table.add_column("Value", style="green")

        table.add_row("Title", result.title or "(unknown)")
        table.add_row("Authors", ", ".join(result.authors) or "(unknown)")
        table.add_row("Model", result.model)
        table.add_row("Mode", result.processing_mode)
        table.add_row("Pages total", str(result.total_pages))
        table.add_row("Pages parsed", str(result.pages_processed))
        table.add_row("Equations", str(result.total_equations))
        table.add_row("Tables", str(result.total_tables))
        table.add_row("Algorithms", str(result.total_algorithms))
        table.add_row("Figures", str(result.total_figures))
        table.add_row("Tokens used", f"{result.total_tokens_used:,}")
        table.add_row("Processing time", f"{result.total_processing_time_s:.1f}s")
        table.add_row("Document hash", result.document_hash)

        console.print(table)
|
| 1071 |
+
|
| 1072 |
+
|
| 1073 |
+
# ---------------------------------------------------------------------------
|
| 1074 |
+
# Agent interface
|
| 1075 |
+
# ---------------------------------------------------------------------------
|
| 1076 |
+
|
| 1077 |
+
class AgentPDFInterface:
    """High-level facade for use inside agent pipelines.

    Every method takes a PDF file path and returns plain, serializable
    Python objects (dicts, lists, strings) — never dataclass instances.

    Example usage in an agent:
        from pdf_atomic_parser import AgentPDFInterface

        agent = AgentPDFInterface(model="opus")
        full = agent.parse("paper.pdf")
        eqs = agent.get_equations("paper.pdf")
        answer = agent.ask("paper.pdf", "What is the loss function?")
    """

    def __init__(self, **kwargs):
        # Keyword arguments are forwarded verbatim to AtomicPDFParser.
        self._parser = AtomicPDFParser(**kwargs)

    def parse(self, pdf_path: str, page_range: Optional[Tuple[int, int]] = None) -> Dict:
        """Fully parse *pdf_path* and return the result as a nested dict."""
        return asdict(self._parser.parse(pdf_path, page_range))

    def get_equations(self, pdf_path: str) -> List[Dict]:
        """Return all extracted equations, one dict per equation."""
        return [asdict(block) for block in self._parser.extract_equations(pdf_path)]

    def get_tables(self, pdf_path: str) -> List[Dict]:
        """Return all extracted tables, one dict per table."""
        return [asdict(block) for block in self._parser.extract_tables(pdf_path)]

    def get_algorithms(self, pdf_path: str) -> List[Dict]:
        """Return all extracted algorithms/code listings, one dict per item."""
        return [asdict(block) for block in self._parser.extract_algorithms(pdf_path)]

    def get_figures(self, pdf_path: str) -> List[Dict]:
        """Return all extracted figure descriptions, one dict per figure."""
        return [asdict(block) for block in self._parser.extract_figures(pdf_path)]

    def ask(self, pdf_path: str, question: str) -> str:
        """Ask a free-form question about the document and return the answer text."""
        return self._parser.query(pdf_path, question)

    def get_full_text(self, pdf_path: str) -> str:
        """Return the concatenated raw text of every parsed page."""
        parsed = self._parser.parse(pdf_path)
        chunks = [
            f"[Page {p.page_number}]\n{p.raw_text}"
            for p in parsed.page_results
        ]
        return "\n\n".join(chunks)

    def cache_stats(self) -> Dict:
        """Expose the underlying parser's cache statistics."""
        return self._parser.cache.stats()
|
| 1122 |
+
|
| 1123 |
+
|
| 1124 |
+
# ---------------------------------------------------------------------------
|
| 1125 |
+
# Batch processor
|
| 1126 |
+
# ---------------------------------------------------------------------------
|
| 1127 |
+
|
| 1128 |
+
def batch_process(
    input_dir: Path,
    output_dir: Path,
    parser: AtomicPDFParser,
    fmt: str = "json",
) -> None:
    """Parse every PDF found under *input_dir* and write one output per file.

    Args:
        input_dir: Directory searched recursively for ``*.pdf`` files.
        output_dir: Destination directory (created if missing).
        parser: Configured AtomicPDFParser instance to use for each file.
        fmt: Output renderer — "json", "markdown", or anything else for text.

    Per-file failures are reported and logged but do not stop the batch.
    """
    pdfs = sorted(input_dir.glob("**/*.pdf"))
    if not pdfs:
        console.print(f"[yellow]No PDF files found in {input_dir}[/yellow]")
        return

    output_dir.mkdir(parents=True, exist_ok=True)
    console.print(f"[cyan]Found {len(pdfs)} PDF files to process.[/cyan]")

    for pdf_path in pdfs:
        console.print(f"\n[bold]Processing:[/bold] {pdf_path.name}")
        try:
            result = parser.parse(pdf_path)
            # Pick renderer + extension together, then write once.
            if fmt == "json":
                out = output_dir / f"{pdf_path.stem}.json"
                content = OutputFormatter.to_json(result)
            elif fmt == "markdown":
                out = output_dir / f"{pdf_path.stem}.md"
                content = OutputFormatter.to_markdown(result)
            else:
                out = output_dir / f"{pdf_path.stem}.txt"
                content = OutputFormatter.to_text(result)
            out.write_text(content, encoding="utf-8")
            console.print(f" [green]Saved:[/green] {out}")
            OutputFormatter.print_summary_table(result)
        except Exception as exc:  # keep going: one bad PDF must not kill the batch
            console.print(f" [red]Error processing {pdf_path.name}: {exc}[/red]")
            logger.exception("Batch error")
|
| 1161 |
+
|
| 1162 |
+
|
| 1163 |
+
# ---------------------------------------------------------------------------
|
| 1164 |
+
# Token estimator
|
| 1165 |
+
# ---------------------------------------------------------------------------
|
| 1166 |
+
|
| 1167 |
+
def estimate_tokens(pdf_path: Path) -> None:
    """Print a rough token-and-cost estimate for parsing *pdf_path*.

    Heuristic only: assumes ~800 input and ~400 output tokens per page of
    dense academic content.  Actual usage varies with layout and mode.
    """
    with PDFDocument(pdf_path) as pdf:
        total = pdf.total_pages
        size_mb = pdf.file_size_bytes / 1e6

    # Rough estimate: ~800 tokens per page for dense academic content
    est_tokens_in = total * 800
    est_tokens_out = total * 400
    est_total = est_tokens_in + est_tokens_out

    # Pricing approximate (Opus: $15/Mtok in, $75/Mtok out as of 2025)
    est_cost_opus = (est_tokens_in * 15 + est_tokens_out * 75) / 1_000_000

    rows = [
        ("Total pages", str(total)),
        ("File size", f"{size_mb:.2f} MB"),
        ("Est. input tokens", f"{est_tokens_in:,}"),
        ("Est. output tokens", f"{est_tokens_out:,}"),
        ("Est. total tokens", f"{est_total:,}"),
        ("Est. cost (Opus)", f"${est_cost_opus:.2f}"),
        ("Note", "Estimate only; actual usage varies"),
    ]

    table = Table(title=f"Token Estimate: {pdf_path.name}", show_lines=True)
    table.add_column("Metric", style="cyan")
    table.add_column("Estimate", style="yellow")
    for label, value in rows:
        table.add_row(label, value)

    console.print(table)
|
| 1193 |
+
|
| 1194 |
+
|
| 1195 |
+
# ---------------------------------------------------------------------------
|
| 1196 |
+
# CLI
|
| 1197 |
+
# ---------------------------------------------------------------------------
|
| 1198 |
+
|
| 1199 |
+
def build_cli() -> argparse.ArgumentParser:
    """Construct the argument parser: global options plus one sub-command each
    for parse/atomic/extract-*/query/batch/estimate and the cache utilities."""
    parser = argparse.ArgumentParser(
        prog="pdf_atomic_parser",
        description="Atomic PDF parser powered by Claude claude-opus-4-6",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Global options apply to every sub-command.
    parser.add_argument("--model", default="opus", help="opus | sonnet | haiku | full-model-string")
    parser.add_argument("--mode", default="native", choices=["native", "image"], help="Parsing mode")
    parser.add_argument("--chunk-size", type=int, default=CHUNK_SIZE_DEFAULT, help="Pages per API call")
    parser.add_argument("--verbose", action="store_true")

    sub = parser.add_subparsers(dest="command", required=True)

    # parse
    p_parse = sub.add_parser("parse", help="Parse a PDF fully")
    p_parse.add_argument("pdf", help="Path to PDF file")
    p_parse.add_argument("--output", "-o", help="Output file path")
    p_parse.add_argument("--format", "-f", default="json", choices=["json", "markdown", "text"])
    p_parse.add_argument("--pages", help="Page range e.g. 1-50")

    # atomic (alias for parse with all content)
    p_atomic = sub.add_parser("atomic", help="Full atomic extraction to directory")
    p_atomic.add_argument("pdf", help="Path to PDF file")
    p_atomic.add_argument("--output", "-o", default="./atomic_output")

    # The four single-artifact extractors share an identical argument shape,
    # so build them from a table instead of four copy-pasted stanzas.
    extractors = [
        ("extract-equations", "Extract LaTeX equations"),
        ("extract-tables", "Extract tables"),
        ("extract-algorithms", "Extract algorithms/code"),
        ("extract-figures", "Extract figure descriptions"),
    ]
    for cmd_name, cmd_help in extractors:
        p_ext = sub.add_parser(cmd_name, help=cmd_help)
        p_ext.add_argument("pdf")
        p_ext.add_argument("--output", "-o")

    # query
    p_q = sub.add_parser("query", help="Ask a question about the PDF")
    p_q.add_argument("pdf")
    p_q.add_argument("question", help="Question to ask")

    # batch
    p_batch = sub.add_parser("batch", help="Batch process a directory of PDFs")
    p_batch.add_argument("directory")
    p_batch.add_argument("--output", "-o", default="./batch_output")
    p_batch.add_argument("--format", "-f", default="json", choices=["json", "markdown", "text"])

    # estimate
    p_est = sub.add_parser("estimate", help="Estimate token cost before parsing")
    p_est.add_argument("pdf")

    # cache commands
    sub.add_parser("cache-stats", help="Show cache statistics")
    sub.add_parser("list-cache", help="List all cached documents")
    p_cc = sub.add_parser("clear-cache", help="Clear cache for a document")
    p_cc.add_argument("pdf", help="PDF path (to identify document)")

    return parser
|
| 1266 |
+
|
| 1267 |
+
|
| 1268 |
+
def parse_page_range(s: str) -> Tuple[int, int]:
    """Parse a CLI page-range string like ``"1-50"`` into ``(start, end)``.

    Args:
        s: Range in the form ``start-end`` (two dash-separated integers).

    Returns:
        Tuple of (start, end) page numbers, inclusive.

    Raises:
        ValueError: if the string is not two dash-separated integers, or the
            range is reversed (start > end) — previously a reversed range was
            accepted silently and produced an empty/confusing page selection
            downstream.
    """
    parts = s.split("-")
    if len(parts) != 2:
        raise ValueError(f"Page range must be in format start-end, got: {s}")
    start, end = int(parts[0]), int(parts[1])
    if start > end:
        raise ValueError(f"Page range start must not exceed end, got: {s}")
    return start, end
|
| 1273 |
+
|
| 1274 |
+
|
| 1275 |
+
def save_output(content: str, output_path: Optional[str], default_name: str) -> None:
    """Write *content* to *output_path*, falling back to *default_name*.

    Parent directories are created as needed; the saved location is echoed
    to the console.
    """
    target = Path(output_path or default_name)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(content, encoding="utf-8")
    console.print(f"[green]Saved:[/green] {target}")
|
| 1280 |
+
|
| 1281 |
+
|
| 1282 |
+
def main() -> None:
    """CLI entry point: parse argv and dispatch to the chosen sub-command.

    Cache-only commands (cache-stats, list-cache, clear-cache) and the token
    estimator run without constructing an AtomicPDFParser, so they work even
    when no API credentials are configured.
    """
    cli = build_cli()
    args = cli.parse_args()
    cache = ParseCache(Path.home() / ".cache" / "pdf_atomic_parser")

    if args.command == "cache-stats":
        stats = cache.stats()
        table = Table(title="Cache Statistics", show_lines=True)
        table.add_column("Key", style="cyan")
        table.add_column("Value", style="green")
        for k, v in stats.items():
            table.add_row(k.replace("_", " ").title(), str(v))
        console.print(table)
        return

    if args.command == "list-cache":
        # Hoisted out of the per-document loop (was re-imported every iteration).
        import datetime

        docs = cache.list_documents()
        if not docs:
            console.print("[yellow]Cache is empty.[/yellow]")
            return
        table = Table(title="Cached Documents", show_lines=True)
        table.add_column("Hash", style="cyan")
        table.add_column("Cached Pages", style="green")
        table.add_column("First Seen", style="dim")
        for d in docs:
            ts = datetime.datetime.fromtimestamp(d["first_seen"]).strftime("%Y-%m-%d %H:%M")
            table.add_row(d["hash"], str(d["cached_pages"]), ts)
        console.print(table)
        return

    if args.command == "estimate":
        estimate_tokens(Path(args.pdf))
        return

    # clear-cache needs no API client, so handle it BEFORE building the
    # parser (previously it constructed an AtomicPDFParser just to clear).
    if args.command == "clear-cache":
        doc_hash = cache.file_hash(Path(args.pdf))
        n = cache.clear_document(doc_hash)
        console.print(f"[green]Cleared {n} cached pages for {Path(args.pdf).name}[/green]")
        return

    parser = AtomicPDFParser(
        model=args.model,
        mode=args.mode,
        chunk_size=args.chunk_size,
        verbose=args.verbose,
    )

    if args.command in ("parse", "atomic"):
        page_range = None
        if getattr(args, "pages", None):
            page_range = parse_page_range(args.pages)

        result = parser.parse(args.pdf, page_range)
        OutputFormatter.print_summary_table(result)

        # Single source of truth for renderer + file extension per format.
        formatters = {
            "json": (OutputFormatter.to_json, "json"),
            "markdown": (OutputFormatter.to_markdown, "md"),
            "text": (OutputFormatter.to_text, "txt"),
        }
        stem = Path(args.pdf).stem

        if args.command == "atomic":
            # "atomic" writes all three renderings into the output directory.
            out_dir = Path(args.output)
            out_dir.mkdir(parents=True, exist_ok=True)
            for fmt, (render, ext) in formatters.items():
                fn = f"{stem}.{ext}"
                (out_dir / fn).write_text(render(result), encoding="utf-8")
                console.print(f"[green]Saved {fmt}:[/green] {out_dir / fn}")
        else:
            fmt = args.format
            render, ext = formatters[fmt]
            # Extension now matches batch output ("text" -> .txt, not .text).
            save_output(render(result), getattr(args, "output", None), f"{stem}_parsed.{ext}")

    elif args.command.startswith("extract-"):
        # All four extract-* commands are identical apart from which PageResult
        # attribute they read; the attribute name equals the command suffix.
        kind = args.command.split("-", 1)[1]  # equations|tables|algorithms|figures
        result = parser.parse(args.pdf)
        items = [asdict(item) for p in result.page_results for item in getattr(p, kind)]
        content = json.dumps(items, indent=2, ensure_ascii=False)
        save_output(content, args.output, f"{Path(args.pdf).stem}_{kind}.json")
        console.print(f"[cyan]{len(items)} {kind} extracted.[/cyan]")

    elif args.command == "query":
        answer = parser.query(args.pdf, args.question)
        console.print(f"\n[bold cyan]Answer:[/bold cyan]\n{answer}")

    elif args.command == "batch":
        batch_process(
            Path(args.directory),
            Path(args.output),
            parser,
            getattr(args, "format", "json"),
        )
|
| 1402 |
+
|
| 1403 |
+
|
| 1404 |
+
# Allow direct execution: python pdf_atomic_parser.py <command> ...
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anthropic>=0.43.0
|
| 2 |
+
PyMuPDF>=1.24.0
|
| 3 |
+
rich>=13.7.0
|
| 4 |
+
tqdm>=4.66.0
|