diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..5b1c3ac0de8096376f74ef34bf08eaa86f651e3a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,59 @@ +# Ignore development artifacts +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +*.dylib +*.log +.venv/ +venv/ +ENV/ +env/ +.git/ +.gitignore +.gitlab-ci.yml +*.md +!README.md +.pytest_cache/ +*.swp +*.swo +*~ +.DS_Store + +# Ignore data directories (too large for Docker context) +data/ +!data/prompt_templates/ +!data/visual_element_prefabs/ + +# Ignore build artifacts +*.egg-info/ +dist/ +build/ +*.whl + +# Ignore handwriting service (separate deployment) +handwriting_service/ + +# Ignore WordStylist (not needed for API) +WordStylist/ + +# Ignore scripts (not needed for API runtime) +scripts/ + +# Ignore documentation and deployment files +ARCHITECTURE.md +DEPLOYMENT.md +*.sh +!start.sh +!start_worker.sh +docker-compose.yml +railway.json +railway_setup_vars.sh + +# Keep only essential code +!docgenie/ +!api/ +!setup.py +!pyproject.toml diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..aec80c52e8a5408669d9e12fe7376fc1488be89b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +*.svg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +*.ico filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000000000000000000000000000000000000..92b93970145852b19d0dab127119aab54824a7ad --- /dev/null +++ b/.gitignore @@ -0,0 +1,172 @@ +# Project +data/clusters/ +data/embeddings/ +data/temp/ +wandb/ +data/models/ +data/webapp_cache/ +data/analyzation/ +data/cherrypicks/ +data/hw_imgs/ +/data/seed-images/* +/docgenie/playground/test.py +/docgenie/playground/handwritten_text/doc_vqa_handwriting_text_images +/docgenie/playground/handwritten_text/handwriting_raw_tokens +/docgenie/playground/handwritten_text/temp +data/datasets +data/models +data/cluster_plots +data/syn_dataset_statistics_plots +data/gt_embeddings +data/wandb_downloads +data/wandb_project_csvs +data/folders.txt +cache +runs +visualizations +.venv +**/**.__pycache__ +/docgenie/playground/handwritten_text/doc_vqa_handwriting_text_images +/docgenie/playground/handwritten_text/temp +data/datasets +data/models + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.log + +# Virtual environments +venv/ +env/ +ENV/ +.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Jupyter Notebook +.ipynb_checkpoints +*.ipynb_checkpoints/ + +# Model artifacts - download separately +inference/ +inference_new/ +inference_hf/ +model/experiments/hf_conditional_latent/cached_vae/ +*.zip + + +# Datasets - download separately +docvqa-handwritten-sizes4/ +syn_docvqa/ +iam_dataset/ +iam_dataset_processed/ +iam_dataset_processed_partial/ +docvqa-test/ +docvqa-viselems/ +docvqa-viselems2/ +temp/ +generations/ + +# Generated outputs +output/ + +# Backup files +*.bak +*.backup +*.tmp + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# OS +./data/clusters_old/ +Thumbs.db + + +# Training +training/ +vae_evaluation/ + + +# Logs and checkpoints +*.pt +# But allow the inference model for 
handwriting service +!handwriting_service/WordStylist/models/ema_ckpt.pt +*.ckpt +*.pth +*.safetensors + +.env + +# Playwright +node_modules/ +/test-results/ +/playwright-report/ +/blob-report/ +/playwright/.cache/ +/playwright/.auth/ + + +!data/models/ +!data/models/handwriting/ +!data/models/handwriting/char_vocab.json +!data/models/handwriting/config.yaml +!data/models/handwriting/writer_id_map.json +!data/models/handwriting/cached_vae/config.json +data/models/.locks* +data/models/baseline +data/models/legacy +data/models/models* +data/models/pretrained +test_run.py +test_vlm.ipynb +test.ipynb +test2.ipynb +test3.py +test4.py +test5.py +test6.py +data/results +data/results_old/ +data/tmp/ +docgenie/playground/extract_02_eval_metrics_from_wandb.py +docgenie/playground/extract_metrics_from_wandb.py +data/cached_subsets +data/mixed_datasets +data/results_backup_v1 +data/results_v1 +data/old-results/ +data/embeddings +data/mixed_datasets +data/results_backup_v1 +sync_datasets.sh +data/results_latest +data/results_latest copy diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100755 index 0000000000000000000000000000000000000000..6cd5d7481bede501a691eac5043403cd029d7eec --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,16 @@ +# You can override the included template(s) by including variable overrides +# SAST customization: https://docs.gitlab.com/ee/user/application_security/sast/#customizing-the-sast-settings +# Secret Detection customization: https://docs.gitlab.com/user/application_security/secret_detection/pipeline/configure +# Dependency Scanning customization: https://docs.gitlab.com/ee/user/application_security/dependency_scanning/#customizing-the-dependency-scanning-settings +# Container Scanning customization: https://docs.gitlab.com/ee/user/application_security/container_scanning/#customizing-the-container-scanning-settings +# Note that environment variables can be set in several places +# See https://docs.gitlab.com/ee/ci/variables/#cicd-variable-precedence +stages: +- test +- secret-detection +variables: + SECRET_DETECTION_ENABLED: 'true' +secret_detection: + stage: secret-detection +include: +- template: Security/Secret-Detection.gitlab-ci.yml diff --git a/.python-version b/.python-version new file mode 100755 index 0000000000000000000000000000000000000000..efbce23a0e1b1eed58654641085f009d5233a0fb --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11.12 diff --git a/API_FLOW_DOCUMENTATION.md b/API_FLOW_DOCUMENTATION.md new file mode 100644 index 0000000000000000000000000000000000000000..b8a7767b0b10d8485dfad38c82284ade137eb4d3 --- /dev/null +++ b/API_FLOW_DOCUMENTATION.md @@ -0,0 +1,1024 @@ +# Complete API Flow Documentation + +## Overview +The DocGenie API provides three endpoints for synthetic document generation, implementing a 19-stage pipeline that transforms seed images and prompts into complete datasets with OCR, ground truth, and optional handwriting/visual elements. + +**Base URL**: `http://localhost:8000` (development) or Railway deployment +**Documentation**: `/docs` (FastAPI auto-generated Swagger UI) + +--- + +## API Endpoints + +### 1. `/generate` - Legacy JSON Response (POST) +**Purpose**: Generate documents and return complete JSON metadata +**Response**: JSON with HTML, PDF (base64), bounding boxes, optional handwriting/visual elements +**Use Case**: Testing, development, full metadata inspection +**Pipeline Stages**: 1-19 (configurable via parameters) + +### 2. 
`/generate/pdf` - Sync PDF+Dataset ZIP (POST) +**Purpose**: Generate documents and return ZIP file with all artifacts +**Response**: ZIP file containing: +- `*.pdf` - Generated document PDFs +- `*_final.pdf` - PDFs with handwriting/visual elements (if enabled) +- `*.msgpack` - Dataset format (if export enabled) +- `metadata.json` - Complete generation metadata +- `handwriting/` - Individual handwriting images +- `visual_elements/` - Individual visual element images + +**Use Case**: Production dataset generation, batch processing +**Pipeline Stages**: 1-19 (all features available) + +### 3. `/generate/async` - Async Batch Processing (POST) +**Purpose**: Queue large batch jobs via background worker (Redis Queue) +**Response**: Task ID for status polling +**Status Check**: `GET /generate/async/status/{task_id}` +**Result Download**: `GET /generate/async/result/{task_id}` (returns ZIP) +**Use Case**: Large-scale dataset generation (100+ documents) +**Pipeline Stages**: 1-19 (via worker.py) + +--- + +## Request Parameters + +```python +class GenerateDocumentRequest: + seed_images: List[HttpUrl] # 1-8 seed images from web URLs + prompt_params: PromptParameters # Generation configuration + +class PromptParameters: + # Core Parameters + language: str = "english" # Document language + doc_type: str = "invoice" # Document type (invoice, receipt, form, etc.) + gt_type: str = "qa" # Ground truth format (qa, kie) + gt_format: str = "json" # GT encoding (json, annotation) + num_solutions: int = 1 # Documents per seed set + + # Feature Toggles (Stages 07-19) + enable_handwriting: bool = False # Stage 07-09, 12 + handwriting_ratio: float = 0.2 # Probabilistic filter (0.0-1.0) + enable_visual_elements: bool = False # Stage 08, 10, 13 + visual_element_types: List[str] = [] # Filter types: logo, photo, figure, barcode, etc. + enable_ocr: bool = True # Stage 15 + enable_bbox_normalization: bool = True # Stage 16 + enable_gt_verification: bool = False # Stage 17 + enable_analysis: bool = False # Stage 18 + enable_debug_visualization: bool = False # Stage 19 + enable_dataset_export: bool = False # Stage 19 (msgpack format) + dataset_export_format: str = "msgpack" # Currently only msgpack supported + + # Reproducibility + seed: Optional[int] = None # Random seed (null = random, int = reproducible) +``` + +--- + +## Pipeline Architecture: The 19 Stages + +The API implements all 19 stages of the original batch pipeline in `docgenie/generation/`. Each stage is mapped to corresponding functions in `api/utils.py`. + +### **Phase 1: Core Pipeline (Stages 01-06)** +Generate base documents from seed images and LLM prompts. + +#### **Stage 01: Seed Selection & Download** +- **Original**: `pipeline_01_select_seeds.py` +- **API**: `download_seed_images()` in `api/utils.py:117-161` +- **Process**: + 1. Accept user-provided seed image URLs (1-8 images) + 2. Download with retry logic (3 attempts, exponential backoff) + 3. Handle transient HTTP errors (502, 503, 504, 429) + 4. Convert to base64 for LLM input +- **Error Handling**: Retry with 2s, 4s, 8s delays; raise HTTPException on failure + +#### **Stage 02: Prompt LLM** +- **Original**: `pipeline_02_prompt_llm.py` +- **API**: `call_claude_api_direct()` in `api/utils.py:550-600` +- **Process**: + 1. Load prompt template: `data/prompt_templates/ClaudeRefined12/seed-based-json.txt` + 2. Build prompt with parameters: language, doc_type, gt_type, num_solutions + 3. 
Call Claude API (Anthropic Messages API v1) + - Model: `claude-3-5-sonnet-20241022` (configurable) + - Max tokens: 16,000 + - Temperature: 1.0 + - Vision: Send base64-encoded seed images + 4. Receive HTML documents with embedded ground truth +- **LLM Output Format**: Multiple `...` blocks with: + - CSS styling with page dimensions + - HTML elements with semantic classes + - Handwriting markers: `class="handwritten author1"` (author1, author2, etc.) + - Visual element placeholders: `data-placeholder="logo"`, `data-content="company-logo"` + - Ground truth: `` + +#### **Stage 03: Process Response & Extract HTML** +- **Original**: `pipeline_03_process_response.py` +- **API**: `extract_html_documents_from_response()` in `api/utils.py:605-635` +- **Process**: + 1. Parse LLM response for `...` blocks (regex) + 2. Prettify HTML with BeautifulSoup + 3. Validate HTML structure + 4. Extract ground truth JSON from ` in the following format: {gt example} +Notes: +• Pay close attention to cultural/regional differences seen +in the seed images (e.g., language, format, disclaimers). +• Feel free to creatively adapt or combine stylistic cues +from the seeds, as long as the end result looks authentic +for that cultural context. +• Do NOT directly copy-paste text or entire code blocks +from any single seed image or across these new solutions. +Now please generate the {num solutions} distinct +{doc type} documents. diff --git a/data/prompt_templates/Adaptation_GT/seed-free.txt b/data/prompt_templates/Adaptation_GT/seed-free.txt new file mode 100755 index 0000000000000000000000000000000000000000..634698f7e5ffd52c3e87fbf2a7844c3079998758 --- /dev/null +++ b/data/prompt_templates/Adaptation_GT/seed-free.txt @@ -0,0 +1,25 @@ +You are an AI specialized in generating multiple unique +HTML documents in one response. Please create +{num solutions} unique HTML documents representing +{doc type}. +Each solution must: +1. Include all mandatory fields: {sections}. +2. Be formatted so it could print on A4 (e.g., use @page +{{ size: A4; }} in your CSS). +3. Show a significantly different layout, styling, and textual content from every other solution. +4. Maintain a {background requirements}. +5. Avoid copy-pasting or reusing large chunks of HTML, +CSS, or disclaimers—each document must be at least +70% different in code and text than the others. +6. Wrap each complete document between +and tags, labeled as: +1. ...Solution #1... +2. ...Solution #2... +... +{num solutions}. ...Solution +#{num solutions}... +Include the {gt type} as JSON in the document via in the following format: {gt example} +Do not provide additional commentary or references to the +other solutions within each HTML. +Now generate the {num solutions} distinct {doc type} +documents. diff --git a/data/prompt_templates/ClaudeRefined1/seed-based.txt b/data/prompt_templates/ClaudeRefined1/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..83b816d620eed5f01c6328df4c1f5d02f3f4bd13 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined1/seed-based.txt @@ -0,0 +1,78 @@ +# HTML Document Generation Prompt (Refined) + +You are an AI specialized in creating culturally authentic HTML documents based on visual analysis of real-world examples. You have been provided with {num_seed_images} seed images of **{doc_type}** documents from different cultural and regional contexts. 
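A minimal sketch of the Stage 03 parsing described above (regex extraction of the `<html>...</html>` blocks followed by BeautifulSoup prettifying); the actual `extract_html_documents_from_response()` in `api/utils.py` is not shown in this diff, so the helper below is an illustrative approximation rather than the real implementation:

```python
import re
from bs4 import BeautifulSoup  # requires beautifulsoup4


def extract_html_documents(response_text: str) -> list[str]:
    """Illustrative sketch of Stage 03: pull each <html>...</html> block out of
    the LLM response and prettify it. The production helper in api/utils.py may
    differ in name, validation, and error handling."""
    blocks = re.findall(r"<html\b.*?</html>", response_text, flags=re.DOTALL | re.IGNORECASE)
    return [BeautifulSoup(block, "html.parser").prettify() for block in blocks]
```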
+ +## Cultural Variations (If Present) +The seed images may demonstrate regional differences such as: +- Language variations and local terminology +- Date formatting conventions (DD/MM/YYYY, MM/DD/YYYY, etc.) +- Currency symbols and number formatting +- Layout preferences (field positioning, official elements, cultural design patterns) +- Regional legal disclaimers and regulatory requirements +- Typography and visual hierarchy standards + +## Task Requirements +Generate **{num_solutions}** unique HTML documents that meet these specifications: + +### Core Requirements +1. **Cultural Authenticity**: If cultural/regional variations are present in the seed images, reflect those stylistic elements without directly copying any text, disclaimers, or layouts verbatim +2. **Required Content**: Include all essential fields: {required_sections} +3. **Single Page Format**: Design as single-page documents with dimensions appropriate to the document type (receipts: narrow format, forms: standard width, etc.) +4. **Language**: Generate all content in {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: Each document must be at least 70% different in code structure, styling, and content from others + +## Ground Truth Generation +Generate appropriate ground truth data for each document: {gt_type}. +Include the ground truth as JSON inside each document in a `` tag. +The ground truth must follow the format: {gt_format} + +### Technical Specifications +- Wrap each solution in `...` tags numbered sequentially +- Include the ground truth JSON in `` as specified above +- Implement static CSS appropriate for the document type and single-page layout (no animations, transitions, or dynamic effects) + +## Additional Requirements +{user_descriptions} + +### Content Guidelines +- **DO**: Adapt any cultural/regional stylistic elements present in the seed images +- **DO**: Create authentic-feeling content appropriate to each cultural context +- **DO**: Vary layout structures, color schemes, and typographic choices +- **DO**: Use static styling only (no animations, hover effects, or transitions) +- **DON'T**: Copy-paste text, code blocks, or entire sections between solutions +- **DON'T**: Reuse identical disclaimers, headers, or formatting patterns +- **DON'T**: Include any dynamic effects, animations, or interactive elements + +## Additional Requirements +{user_descriptions} + +## Output Format +Structure your response as: + +``` +1. + + ...complete HTML document... + + +2. + + ...complete HTML document... + + +...continue for all {num_solutions} solutions +``` + +## Quality Checklist +Before generating, ensure each document: +- [ ] Reflects any authentic cultural/regional characteristics present in seed images +- [ ] Contains all required sections: {required_sections} +- [ ] Uses static styling only (no animations or dynamic effects) +- [ ] Uses appropriate single-page formatting for the document type +- [ ] All content is in English +- [ ] Includes the specified ground truth in proper JSON format +- [ ] Maintains 70%+ uniqueness from other solutions +- [ ] Follows semantic HTML best practices + +Now generate the **{num_solutions}** distinct **{doc_type}** documents. 
\ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined10/seed-based.txt b/data/prompt_templates/ClaudeRefined10/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..21fb12f4689eae2f5eb109d684594b5b67a21c42 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined10/seed-based.txt @@ -0,0 +1,57 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: signature, stamp, logo, barcode, photo, chart +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people and ensure the author is semantically coherent with the document content +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Dimensions in mm/cm, e.g. `width:30mm;height:20mm;` +- Example: `
` +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS/JS +- [ ] Content in {language} +- [ ] GT JSON present and correctly formatted +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined11/seed-based.txt b/data/prompt_templates/ClaudeRefined11/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..ea526e5508632fca75840f26cc364944daa15015 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined11/seed-based.txt @@ -0,0 +1,55 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and use regular text +- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- If the handwriting represents a signature mark it additionally with class 'signature' + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: stamp, logo, barcode, photo, chart +- Add data-content attribute with actual content description +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Always provide dimensions in mm/cm, e.g. `width:30mm;height:20mm;` +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS/JS +- [ ] Content in {language} +- [ ] GT JSON present and correctly formatted +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt b/data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt new file mode 100755 index 0000000000000000000000000000000000000000..166162e89a5670ee549ee9a0eb66df9642bdaa2d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt @@ -0,0 +1,55 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and use regular text +- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- If the handwriting represents a signature mark it additionally with class 'signature' + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: stamp, logo, figure, barcode, photo +- Add data-content attribute with actual content description +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Always provide appropriate dimensions +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth by assigning each applicable element in HTML a class from the list below to uniquely identify its label: +{gt_type} +{gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS +- [ ] Content in {language} +- [ ] GT labels via class annotations are present and assigned to correct elements +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined12/seed-based-json.txt b/data/prompt_templates/ClaudeRefined12/seed-based-json.txt new file mode 100755 index 0000000000000000000000000000000000000000..6dbac5efd21eb7a8365ac553b11817d6defbb395 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined12/seed-based-json.txt @@ -0,0 +1,55 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and use regular text +- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- If the handwriting represents a signature mark it additionally with class 'signature' + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: stamp, logo, figure, barcode, photo +- Add data-content attribute with actual content description +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Always provide appropriate dimensions +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS +- [ ] Content in {language} +- [ ] GT JSON present, correctly formatted and semantically coherent +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined2/seed-based.txt b/data/prompt_templates/ClaudeRefined2/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..107eeb8425d485b835ec96c4f36573cdb54d1d70 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined2/seed-based.txt @@ -0,0 +1,70 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' (no special styling/fonts, treat as regular text) +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements using HTML class 'visual-element' +- Add data attributes: data-type (signature/logo/stamp/barcode/photo/chart/etc.) and data-content (actual content) +- Give each placeholder appropriate dimensions via inline styles +- Examples: `
`, `
`, `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined3/seed-based.txt b/data/prompt_templates/ClaudeRefined3/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..1ef1aeafbc5d5865dc065b64c5ff0520bcfc5a03 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined3/seed-based.txt @@ -0,0 +1,70 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' (no special styling/fonts, treat as regular text) +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements as JSON in `` tag. +- Describe type (signature/logo/stamp/barcode/photo/chart/etc.) and content (actual content) +- Describe placement of each visual element with appropriate dimensions and y-rotation +- Examples: `[{"type": "signature", "content": "John Doe", "x0": 105, "x1": 116, "y0": 82, "y1": 102, "rotation": -4}, ...]` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... 
+``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined3CloneDoc/seed-based.txt b/data/prompt_templates/ClaudeRefined3CloneDoc/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..b968eed4d2f93b252244cc9afb1b18d4b6556a5d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined3CloneDoc/seed-based.txt @@ -0,0 +1,97 @@ +You are an AI creating HTML documents that **clone the style and structure** of {num_seed_images} seed images of **{doc_type}** documents. + +# Task: Generate {num_solutions} cloned HTML documents + +## Core Objective +**CLONE the visual design, layout, and structure** of the seed images while using **completely different data**. Think of this as creating blank template instances filled with new information. + +## Critical Requirements +1. **Visual Fidelity**: Replicate styling elements from seed images: + - Exact layout structure (positioning, spacing, alignment) + - Typography (fonts, sizes, weights, colors) + - Visual hierarchy and sectioning + - Color schemes and backgrounds + - Border styles, dividers, and decorative elements + - Logo/header/footer placement and styling + +2. **Data Uniqueness**: Generate completely new content: + - **NEVER copy**: names, addresses, phone numbers, emails, IBANs, account numbers, license numbers, ID numbers, dates, amounts, prices, or any other specific data points + - Generate realistic but fictional alternatives for all data fields + - Maintain data type appropriateness (valid formats for phones, IBANs, dates, etc.) + - Ensure cultural/regional authenticity for generated data + +3. **Required Fields**: {required_sections} + +4. **Format**: Single-page, dimensions matching seed documents + +5. **Language**: {language} + +6. **Background**: {background_requirements} + +7. **Static Only**: No animations, transitions, or dynamic effects + +## Cloning Strategy +- **DO**: Match layout grids, spacing, font choices, color palettes, sectioning patterns, table structures, visual element placement +- **DON'T**: Copy any actual text content, numerical data, personal information, or business-specific details +- **Think**: "Same template, different instance" + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Replicate CSS styling patterns from seed documents + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' (no special styling/fonts, treat as regular text) +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document +- Generate different handwritten content than seed documents + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements as JSON in `` tag. +- Describe type (signature/logo/stamp/barcode/photo/chart/etc.) 
and content (actual content - must be different from seed) +- Match placement patterns from seed documents with appropriate dimensions and y-rotation +- Examples: `[{"type": "signature", "content": "Jane Smith", "x0": 105, "x1": 116, "y0": 82, "y1": 102, "rotation": -4}, ...]` + +## Data Generation Guidelines +- Names: Generate culturally appropriate fictional names +- Addresses: Create realistic but non-existent addresses +- Phone/Fax: Use valid formats with fictional numbers +- IBANs/Account numbers: Generate format-compliant fictional numbers +- Dates: Use different dates maintaining logical consistency +- Amounts: Generate different values appropriate to context +- IDs/References: Create format-matching fictional identifiers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Layout/structure matches seed documents +- [ ] Typography and colors replicated +- [ ] ALL data is different from seed (no copied info) +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] Data formats are culturally appropriate + +Generate {num_solutions} cloned {doc_type} documents with new data. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined4/seed-based.txt b/data/prompt_templates/ClaudeRefined4/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..985e165a782f5848155ec49288544652368759cd --- /dev/null +++ b/data/prompt_templates/ClaudeRefined4/seed-based.txt @@ -0,0 +1,71 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and apply no styles to this class +- Distinguish between different sizes of handwriting using classes 'hw-size1', 'hw-size2' which are in line with realistic handwriting and dependent on the context +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements as JSON in `` tag. 
+- Describe type (signature/logo/stamp/barcode/photo/chart/etc.) and content (actual content) +- Describe placement of each visual element with appropriate dimensions and y-rotation +- Examples: `[{"type": "signature", "content": "John Doe", "x0": 105, "x1": 116, "y0": 82, "y1": 102, "rotation": -4}, ...]` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined5/seed-based.txt b/data/prompt_templates/ClaudeRefined5/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..0d87781ea0b394f199ee0b01c8d3a1705c29c166 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined5/seed-based.txt @@ -0,0 +1,77 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Use invisible placeholder divs with class 'visual-placeholder' +- Specify type via data-type attribute ('signature', 'stamp', 'logo', 'barcode', 'photo', 'chart', etc.) +- Add data-content attribute with actual content description +- For signatures/handwriting, add author class ('author1', 'author2', etc.) 
to distinguish different people +- Position naturally in document flow or use CSS positioning (absolute/relative) as appropriate +- Specify dimensions in mm/cm and rotation via inline style transform +- For overlapping elements (stamps over text), use CSS z-index and absolute positioning +- Example: `
` +- Example: `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined6/seed-based.txt b/data/prompt_templates/ClaudeRefined6/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..34abb8cfa0c6676642a2a3dae57dc4ed613dab8d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined6/seed-based.txt @@ -0,0 +1,77 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Use invisible placeholder divs with class 'visual-placeholder' +- Specify type via data-type attribute (signature, stamp, logo, barcode, photo, chart, etc.) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Position naturally in document flow or use CSS positioning (absolute/relative) as appropriate +- Specify dimensions in mm/cm and rotation via **inline** style transform +- For overlapping elements (stamps over text), use CSS z-index and absolute positioning +- Example: `
` +- Example: `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined7/seed-based.txt b/data/prompt_templates/ClaudeRefined7/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..9016855e8c2b98941ccbee9dd1a1619d64daf01d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined7/seed-based.txt @@ -0,0 +1,78 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Specify page size via `@media print { @page { size: ... } }` in CSS and use standard sizes when appropiate + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Use invisible placeholder divs with class 'visual-placeholder' +- Specify type via data-type attribute (signature, stamp, logo, barcode, photo, chart, etc.) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Position naturally in document flow or use CSS positioning (absolute/relative) as appropriate +- Specify dimensions in mm/cm +- For overlapping elements (e.g. stamps over text), use CSS z-index and absolute positioning +- Example: `
` +- Example: `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined8/seed-based.txt b/data/prompt_templates/ClaudeRefined8/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..2c92de2b2134d09db081d31545dc2511579008d8 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined8/seed-based.txt @@ -0,0 +1,60 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Specify page size via `@media print { @page { size: ... } }` and also `body` such that the content looks the same in browser and when printed +- In CSS use standard sizes when appropriate +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders +- Use `
` for non-text elements (signature, stamp, logo, barcode, photo, chart) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Dimensions in mm/cm: `width:30mm;height:20mm;` +- Positioning: `position:absolute;top:50mm;right:20mm;` with `z-index` for overlays +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +- Generate ground truth as JSON in `` tag. +- For each GT entry, insert the key of the entry as the `id` attribute with the corresponding HTML element. +- Individual values MUST BE visible and found in the DOM as elements because we want to get the geometries of the values before printing. +- Example: `
Name:
` +- Example: `
Corp XY LLC
` +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with correct dimensions and minified HTML/CSS +- [ ] Content in {language} +- [ ] GT ids present HTML and GT JSON present and correctly formatted + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images in {language}. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined9/seed-based.txt b/data/prompt_templates/ClaudeRefined9/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..4b77f2ba4e08dadcf342bce1510e7286ed80df38 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined9/seed-based.txt @@ -0,0 +1,54 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders +- Use `
` for non-text elements (signature, stamp, logo, barcode, photo, chart) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Dimensions in mm/cm: `width:30mm;height:20mm;` +- Positioning: `position:absolute;top:50mm;right:20mm;` with `z-index` for overlays +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS/JS +- [ ] Content in {language} +- [ ] GT JSON present and correctly formatted + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/DocGenie/seed-based.txt b/data/prompt_templates/DocGenie/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..8c0911a092f031d8914d4831e9a7dfd33944a1d0 --- /dev/null +++ b/data/prompt_templates/DocGenie/seed-based.txt @@ -0,0 +1,39 @@ +You are an AI specialized in generating unique HTML +documents based on multiple scanned images of realworld examples. You have been provided with distinct +sample images, each from a different cultural or regional +background. You have been provided seed images of +{doc type}, each originating from different cultural or regional contexts. For example, some might feature: +• Local languages or regional disclaimers +• Different date formats (e.g., dd/mm/yyyy vs. mm/dd/yyyy) +• Unique currency or numbering formats +• Varying layout norms (positions of key fields, disclaimers, official stamps, etc.) +Now, please generate {num solutions} unique HTML +documents that: +1. Strictly reflect the overall style, layout, and cultural +cues found in these samples, but do NOT copy any text, +disclaimers, or layout verbatim from the samples. +2. Include any essential mandatory fields: {sections}. +3. Maintain an A4 size format for printing (using @page +{{ size: A4; }} or similar CSS). +4. Maintain a {background requirements}. +5. Avoid copy-pasting or reusing large chunks of HTML, +CSS, or disclaimers—each document must be at least +70% different in code and text than the others. +6. Strictly wrap each new document in +... tags, for example: +1. ...Solution #1... +2. ...Solution #2... +... +{num solutions}. ...Solution +#{num solutions}... +Additional Requirements: {user descriptions} +Notes: +• Pay close attention to cultural/regional differences seen +in the seed images (e.g., language, format, disclaimers). +• Feel free to creatively adapt or combine stylistic cues +from the seeds, as long as the end result looks authentic +for that cultural context. +• Do NOT directly copy-paste text or entire code blocks +from any single seed image or across these new solutions. +Now please generate the {num solutions} distinct +{doc type} documents. diff --git a/data/prompt_templates/DocGenie/seed-free.txt b/data/prompt_templates/DocGenie/seed-free.txt new file mode 100755 index 0000000000000000000000000000000000000000..fbdc60bdb2ca1ee2b1904774c481d90d0fe02966 --- /dev/null +++ b/data/prompt_templates/DocGenie/seed-free.txt @@ -0,0 +1,24 @@ +You are an AI specialized in generating multiple unique +HTML documents in one response. Please create +{num solutions} unique HTML documents representing +{doc type}. +Each solution must: +1. Include all mandatory fields: {sections}. +2. Be formatted so it could print on A4 (e.g., use @page +{{ size: A4; }} in your CSS). +3. Show a significantly different layout, styling, and textual content from every other solution. +4. 
Maintain a {background requirements}. +5. Avoid copy-pasting or reusing large chunks of HTML, +CSS, or disclaimers—each document must be at least +70% different in code and text than the others. +6. Wrap each complete document between +and tags, labeled as: +1. ...Solution #1... +2. ...Solution #2... +... +{num solutions}. ...Solution +#{num solutions}... +Do not provide additional commentary or references to the +other solutions within each HTML. +Now generate the {num solutions} distinct {doc type} +documents. diff --git a/data/syn_dataset_definitions/cord_alpha=0.5.yaml b/data/syn_dataset_definitions/cord_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..56477fc1cb5e7283ad9e4c755a9d66e190947ade --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.5.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=0.5" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 
+ - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
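The gt_format above encodes CORD-style key information directly in HTML class attributes: one enumerator class (MENU_<n>, VOID_MENU, GENERIC) plus one field class per span. As a rough illustration only (not code from this diff; BeautifulSoup and the inline snippet are assumptions), grouped ground truth could be read back out of a generated document like this:

```python
# Sketch only: recover grouped KIE ground truth from the class scheme described
# in gt_format above. bs4 is an assumed helper, not a dependency shown in this diff.
from collections import defaultdict
from bs4 import BeautifulSoup

html = (
    '<span class="MENU_1 MENU_NM">Iced Latte</span>'
    '<span class="MENU_1 MENU_PRICE">4.50</span>'
    '<span class="GENERIC TOTAL_TOTAL_PRICE">4.50</span>'
)

def enumerator(classes):
    """Pick the grouping class: MENU_<n>, VOID_MENU, or GENERIC."""
    for c in classes:
        if c in ("VOID_MENU", "GENERIC") or (c.startswith("MENU_") and c[5:].isdigit()):
            return c
    return None

groups = defaultdict(dict)
for el in BeautifulSoup(html, "html.parser").find_all(class_=True):
    enum = enumerator(el["class"])
    fields = [c for c in el["class"] if c != enum]
    if enum and fields:
        groups[enum][fields[0]] = el.get_text(strip=True)

# groups == {"MENU_1": {"MENU_NM": "Iced Latte", "MENU_PRICE": "4.50"},
#            "GENERIC": {"TOTAL_TOTAL_PRICE": "4.50"}}
```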
+ +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..97c8b163be1e4465a3d47ade8cd74a2de3a812cb --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml @@ -0,0 +1,236 @@ +name: "cord_alpha=0.5_v1" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - 
VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
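Each definition names a prompt_template and a prompt_params block; the placeholders seen in the template text earlier ({num_solutions}, {doc_type}, {gt_type}, {gt_format}, ...) are filled from exactly these params. A minimal sketch of that substitution, assuming PyYAML and plain str.format semantics (the repo's actual prompt builder is not shown in this diff):

```python
# Assumed mechanics: load a definition and fill a template excerpt with its prompt_params.
from pathlib import Path
import yaml  # PyYAML, assumed to be available

definition = yaml.safe_load(
    Path("data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml").read_text()
)
params = definition["prompt_params"]

# Stand-in excerpt; the full ClaudeRefined12 template presumably lives under data/prompt_templates/.
template_excerpt = (
    "Generate {num_solutions} distinct {doc_type} documents "
    "based on {num_seed_images} seed images.\nGround truth specification: {gt_type}"
)
prompt = template_excerpt.format(
    num_seed_images=definition["seed_images_count"],  # assumed mapping
    **params,
)
print(prompt)
```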
+ +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=0.75.yaml b/data/syn_dataset_definitions/cord_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..17a90c6b18a4d805e8a7e45dfdb226434d95ca46 --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.75.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=0.75" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 
# the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
+ +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/cord_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..8c8349967f803443cb53f336fbb11aed6fdb7e01 --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.75_v1.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=0.75_v1" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - 
VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
+ +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=1.0.yaml b/data/syn_dataset_definitions/cord_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..564c6ef6081edeebaf80a4a78c0ed17f8fa98f14 --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=1.0.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=1.0" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB.NM #MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB.UNITPRICE #MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB.CNT # MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB.PRICE #MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - 
MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
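Note how this alpha=1.0 definition remaps the sub-item labels to the dotted CORD ontology (MENU_SUB_NM -> MENU.SUB.NM, and so on) via label_mapping. A small sketch of how such a mapping could be applied downstream, assuming PyYAML; the actual conversion code is not part of this diff:

```python
# Sketch: project generated label names onto the original CORD ontology.
from pathlib import Path
import yaml  # assumed dependency

definition = yaml.safe_load(
    Path("data/syn_dataset_definitions/cord_alpha=1.0.yaml").read_text()
)
mapping = definition.get("label_mapping") or {}  # some definitions leave this empty

def to_base_label(label: str) -> str:
    # Unmapped labels (e.g. the GENERIC grouping class) pass through unchanged.
    return mapping.get(label, label)

assert to_base_label("MENU_SUB_NM") == "MENU.SUB.NM"
assert to_base_label("MENU_NM") == "MENU.NM"
assert to_base_label("GENERIC") == "GENERIC"
```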
+ +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/cord_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..f4c0bc3fef2d8b5c777b2b280aeb2891a5a0899f --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=1.0_v1.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=1.0_v1" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - 
VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
+ +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_CLS.yaml b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_CLS.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a584e090f06d98708333198de6d6102016ff0ff3 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_CLS.yaml @@ -0,0 +1,40 @@ +name: "doclaynet4k_alpha=1.0_CLS" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "doclaynet_4k_cls" +documents_count: 4500 +valid_labels: + - financial_reports + - scientific_articles + - laws_and_regulations + - government_tenders + - manuals + - patents +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + document class label + * financial_reports + * scientific_articles + * laws_and_regulations + * government_tenders + * manuals + * patents + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_DLA.yaml b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_DLA.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d672f79881d2d4d10c146a48319a43ce4dc533b2 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_DLA.yaml @@ -0,0 +1,60 @@ +name: "doclaynet4k_alpha=1.0_DLA" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "doclaynet_4k_dla" +documents_count: 4500 +valid_labels: + - LE-CAPTION + - LE-FOOTNOTE + - LE-FORMULA + - LE-LIST-ITEM + - LE-PAGE-FOOTER + - LE-PAGE-HEADER + - LE-PICTURE + - LE-SECTION-HEADER + - LE-TABLE + - LE-TEXT + - LE-TITLE +label_mapping: + LE-CAPTION: Caption + LE-FOOTNOTE: Footnote + LE-FORMULA: Formula + LE-LIST-ITEM: List-item + LE-PAGE-FOOTER: Page-footer + LE-PAGE-HEADER: Page-header + LE-PICTURE: Picture + LE-SECTION-HEADER: Section-header + LE-TABLE: Table + LE-TEXT: Text + LE-TITLE: "Title " +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + * "LE-CAPTION": Text that accompanies and explains figures, tables, or other visual elements, typically appearing above or below the referenced element. + * "LE-FOOTNOTE": Supplementary notes or citations placed at the bottom of a page, providing additional context or references to the main text, distinct from footers. + * "LE-FORMULA": Mathematical equations, chemical formulas, or symbolic expressions, whether displayed inline or as standalone elements. + * "LE-LIST-ITEM": Individual items within enumerated, bulleted, or definition lists, with each list item annotated separately rather than as a unified list structure. + * "LE-PAGE-FOOTER": Recurring content at the bottom of pages such as page numbers, copyright notices, document identifiers, or footer text. + * "LE-PAGE-HEADER": Recurring content at the top of pages including running headers, document titles, chapter names. 
+ * "LE-PICTURE": Photographs, diagrams, charts, graphs, illustrations, and other visual content excluding tables. + * "LE-SECTION-HEADER": Section and subsection headings. + * "LE-TABLE": Complete table structure including grid content, inline captions, and column/row headers as a unified element. + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, definitions, descriptions, and other primary textual content. + * "LE-TITLE": The main document title appearing prominently at the beginning of the document, distinct from section headers. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet_alpha=1.0_CLS.yaml b/data/syn_dataset_definitions/doclaynet_alpha=1.0_CLS.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e741acc279eca821ab3759ef3bc290468e91d052 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet_alpha=1.0_CLS.yaml @@ -0,0 +1,40 @@ +name: "doclaynet_alpha=1.0_CLS" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "doclaynet" +documents_count: 4500 +valid_labels: + - financial_reports + - scientific_articles + - laws_and_regulations + - government_tenders + - manuals + - patents +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + document class label + * financial_reports + * scientific_articles + * laws_and_regulations + * government_tenders + * manuals + * patents + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet_alpha=1.0_DLA.yaml b/data/syn_dataset_definitions/doclaynet_alpha=1.0_DLA.yaml new file mode 100755 index 0000000000000000000000000000000000000000..704cc415d61b48f0ac721ea49b2249a46036ac95 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet_alpha=1.0_DLA.yaml @@ -0,0 +1,49 @@ +name: "doclaynet_alpha=1.0_DLA" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "doclaynet" +documents_count: 4500 +valid_labels: + - LE-CAPTION + - LE-FOOTNOTE + - LE-FORMULA + - LE-LIST-ITEM + - LE-PAGE-FOOTER + - LE-PAGE-HEADER + - LE-PICTURE + - LE-SECTION-HEADER + - LE-TABLE + - LE-TEXT + - LE-TITLE +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + * "LE-CAPTION": Text that accompanies and explains figures, tables, or other visual elements, typically appearing above or below the referenced element. + * "LE-FOOTNOTE": Supplementary notes or citations placed at the bottom of a page, providing additional context or references to the main text, distinct from footers. + * "LE-FORMULA": Mathematical equations, chemical formulas, or symbolic expressions, whether displayed inline or as standalone elements. + * "LE-LIST-ITEM": Individual items within enumerated, bulleted, or definition lists, with each list item annotated separately rather than as a unified list structure. 
+ * "LE-PAGE-FOOTER": Recurring content at the bottom of pages such as page numbers, copyright notices, document identifiers, or footer text. + * "LE-PAGE-HEADER": Recurring content at the top of pages including running headers, document titles, chapter names. + * "LE-PICTURE": Photographs, diagrams, charts, graphs, illustrations, and other visual content excluding tables. + * "LE-SECTION-HEADER": Section and subsection headings. + * "LE-TABLE": Complete table structure including grid content, inline captions, and column/row headers as a unified element. + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, definitions, descriptions, and other primary textual content. + * "LE-TITLE": The main document title appearing prominently at the beginning of the document, distinct from section headers. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa.yaml b/data/syn_dataset_definitions/docvqa.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d01d9234a697f4925263966fe1a164452da20ca3 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa.yaml @@ -0,0 +1,24 @@ +name: "docvqa" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.5.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e85ff874ab12aa07a541c08464fba387c9efce65 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.5.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.5" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..b4969951fe24990db65740d49e1e5aeb40bd79af --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.5_v1.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.5_v1" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.75.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..df32b8ec9bedd893272cf1c2dc07abc0a7efc45d --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.75.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.75" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..0faec31c7ec5b5ddae500e8abfaff2dbbc284de1 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.75_v1.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.75_v1" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=1.0.yaml b/data/syn_dataset_definitions/docvqa_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a300707ba8462a2b661629dd466a475c170a5011 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=1.0.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=1.0" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/docvqa_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..c4600f0d5c0430203a3f34883c18c4d09d18350a --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=1.0_v1.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=1.0_v1" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/funsd_alpha=1.0.yaml b/data/syn_dataset_definitions/funsd_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..08368cca7e58a7a63bdf5010f265e0dc704d213b --- /dev/null +++ b/data/syn_dataset_definitions/funsd_alpha=1.0.yaml @@ -0,0 +1,133 @@ +name: "funsd_alpha=1.0" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "funsd" +documents_count: 300 +valid_labels: + - HEADER + - QUESTION + - ANSWER +label_mapping: +valid_secondary_labels: + - PAIR_1 + - PAIR_2 + - PAIR_3 + - PAIR_4 + - PAIR_5 + - PAIR_6 + - PAIR_7 + - PAIR_8 + - PAIR_9 + - PAIR_10 + - PAIR_11 + - PAIR_12 + - PAIR_13 + - PAIR_14 + - PAIR_15 + - PAIR_16 + - PAIR_17 + - PAIR_18 + - PAIR_19 + - PAIR_20 + - PAIR_21 + - PAIR_22 + - PAIR_23 + - PAIR_24 + - PAIR_25 + - PAIR_26 + - PAIR_27 + - PAIR_28 + - PAIR_29 + - PAIR_30 + - PAIR_31 + - PAIR_32 + - PAIR_33 + - PAIR_34 + - PAIR_35 + - PAIR_36 + - PAIR_37 + - PAIR_38 + - PAIR_39 + - PAIR_40 + - PAIR_41 + - PAIR_42 + - PAIR_43 + - PAIR_44 + - PAIR_45 + - PAIR_46 + - PAIR_47 + - PAIR_48 + - PAIR_49 + - PAIR_50 + - PAIR_51 + - PAIR_52 + - PAIR_53 + - PAIR_54 + - PAIR_55 + - PAIR_56 + - PAIR_57 + - PAIR_58 + - PAIR_59 + - PAIR_60 + - PAIR_61 + - PAIR_62 + - PAIR_63 + - PAIR_64 + - PAIR_65 + - PAIR_66 + - PAIR_67 + - PAIR_68 + - PAIR_69 + - PAIR_70 + - PAIR_71 + - PAIR_72 + - PAIR_73 + - PAIR_74 + - PAIR_75 + - PAIR_76 + - PAIR_77 + - PAIR_78 + - PAIR_79 + - PAIR_80 + - PAIR_81 + - PAIR_82 + - PAIR_83 + - PAIR_84 + - PAIR_85 + - PAIR_86 + - PAIR_87 + - PAIR_88 + - PAIR_89 + - PAIR_90 + - PAIR_91 + - PAIR_92 + - PAIR_93 + - PAIR_94 + - PAIR_95 + - PAIR_96 + - PAIR_97 + - PAIR_98 + - PAIR_99 + - PAIR_100 + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "form" + language: "English" + gt_type: | + keys and their values structured as QA pairs + * "HEADER": The header of the question answer pair. + * "QUESTION": The question i.e. a key. + * "ANSWER": The answer i.e a value. + gt_format: | + Group individual annotations in groups using the enumerator class PAIR_ and a annotation class from the list above (e.g. "PAIR_1 QUESTION", "PAIR_1 ANSWER", "PAIR_2 HEADER", ...). + Ensure to annotate exact using spans, i.e. "QUESTION" element should not contain "ANSWER". 
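For the FUNSD-style definition above, the same enumerator idea groups spans per PAIR_<n>; linking QUESTION and ANSWER inside each group then yields conventional QA tuples. A sketch with invented example data (the repo's real post-processing is not shown here):

```python
# Example input: spans already grouped by their PAIR_<n> class, as in the
# CORD sketch earlier; the concrete values below are made up for illustration.
grouped = {
    "PAIR_1": {"QUESTION": "Date:", "ANSWER": "03/14/1995"},
    "PAIR_2": {"HEADER": "Shipping", "QUESTION": "Carrier:", "ANSWER": "UPS"},
}

qa_pairs = [
    {
        "header": fields.get("HEADER"),
        "question": fields.get("QUESTION"),
        "answer": fields.get("ANSWER"),
    }
    for _, fields in sorted(grouped.items())
]
# -> [{'header': None, 'question': 'Date:', 'answer': '03/14/1995'}, ...]
```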
+ +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/icdar2019_alpha=1.0.yaml b/data/syn_dataset_definitions/icdar2019_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..268048251ed1767b29dbdaa91e4f91356d05e9bc --- /dev/null +++ b/data/syn_dataset_definitions/icdar2019_alpha=1.0.yaml @@ -0,0 +1,27 @@ +name: "icdar2019_alpha=1.0" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "icdar2019" +documents_count: 1600 +valid_labels: + - LE-TABLE +label_mapping: + LE-TABLE: table +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse modern digital-born and historical archival scanned" + language: "English" + gt_type: | + * "LE-TABLE": Any tabular structure containing data organized in rows and columns. Include the complete table region from border to border. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/kleister_alpha=1.0.yaml b/data/syn_dataset_definitions/kleister_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..3c43337a33cae3f3d4477805b7b685140063027b --- /dev/null +++ b/data/syn_dataset_definitions/kleister_alpha=1.0.yaml @@ -0,0 +1,41 @@ +name: "kleister_alpha=1.0" +task: "KIE" +dataloader_model_task_as: "QA" +base_dataset_name: "ex_klc" +documents_count: 4000 +valid_labels: + - address__post_town + - address__postcode + - address__street_line + - charity_name + - charity_number + - income_annually_in_british_pounds + - report_date + - spending_annually_in_british_pounds +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "UK charity annual financial report" + language: "English" + gt_type: | + keys and their values (if applicable, provide as plaintext values from the document): + * "address__post_town": Post town of the address of the charitable organization. + * "address__postcode": Postcode of the address of the charitable organization. + * "address__street_line": Street line of the address of the charitable organization. + * "charity_name": The name of the charitable organization. + * "charity_number": The registered number of the charitable organization. + * "income_annually_in_british_pounds": The annual income in British Pounds of the charitable organization. + * "report_date": The reporting date of the annual document of the charitable organization. + * "spending_annually_in_british_pounds": The annual spending in British Pounds of the charitable organization. 
+ gt_format: '{"address__post_town": "", "spending_annually_in_british_pounds": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/cord.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/cord.yaml new file mode 100755 index 0000000000000000000000000000000000000000..163946dbc2158f3fff6154388ffe18514685689e --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/cord.yaml @@ -0,0 +1,89 @@ +name: "cord" +task: "KIE" +base_dataset_name: "cord" +documents_count: 1000 +valid_labels: + - MENU.NM + - MENU.NUM + - MENU.UNITPRICE + - MENU.CNT + - MENU.DISCOUNTPRICE + - MENU.PRICE + - MENU.ITEMSUBTOTAL + - MENU.VATYN + - MENU.ETC + - MENU.SUB.NM + - MENU.SUB.UNITPRICE + - MENU.SUB.CNT + - MENU.SUB.PRICE + - MENU.SUB.ETC + - VOID_MENU.NM + - VOID_MENU.PRICE + - SUB_TOTAL.SUBTOTAL_PRICE + - SUB_TOTAL.DISCOUNT_PRICE + - SUB_TOTAL.SERVICE_PRICE + - SUB_TOTAL.OTHERSVC_PRICE + - SUB_TOTAL.TAX_PRICE + - SUB_TOTAL.ETC + - TOTAL.TOTAL_PRICE + - TOTAL.TOTAL_ETC + - TOTAL.CASHPRICE + - TOTAL.CHANGEPRICE + - TOTAL.CREDITCARDPRICE + - TOTAL.EMONEYPRICE + - TOTAL.MENUTYPE_CNT + - TOTAL.MENUQTY_CNT + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU.NM": The menu item name. + * "MENU.NUM": The menu item number or identifier. + * "MENU.UNITPRICE": The price per unit of the menu item. + * "MENU.CNT": The quantity or count of the menu item. + * "MENU.DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU.PRICE": The final price of the menu item. + * "MENU.ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU.VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU.ETC": Other miscellaneous menu item information. + * "MENU.SUB.NM": The name of a sub-item or modifier. + * "MENU.SUB.UNITPRICE": The price per unit of the sub-item. + * "MENU.SUB.CNT": The quantity of the sub-item. + * "MENU.SUB.PRICE": The price of the sub-item. + * "MENU.SUB.ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU.NM": The name of a cancelled or voided item. + * "VOID_MENU.PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL.SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL.DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL.SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL.OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL.TAX_PRICE": The tax amount. + * "SUB_TOTAL.ETC": Other subtotal information. + * "TOTAL.TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL.TOTAL_ETC": Other total-related information. + * "TOTAL.CASHPRICE": The amount paid in cash. + * "TOTAL.CHANGEPRICE": The change given back to the customer. + * "TOTAL.CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL.EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL.MENUTYPE_CNT": The count of different menu item types. + * "TOTAL.MENUQTY_CNT": The total quantity of all items ordered. 
+ gt_format: | + Up to 8 menu items and the receipt data as a JSON object { + "MENU_1": {"MENU.NM": "", "MENU.NUM": "", ...}, + "MENU_2": {"MENU.NM": "", "MENU.NUM": "", ...}, + ..., + "VOID_MENU": {"VOID_MENU.NM": "", "VOID_MENU.PRICE": ""}, + "GENERIC": {"SUB_TOTAL.SUBTOTAL_PRICE": "", ..., "TOTAL.TOTAL_PRICE": ...} + } +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/doclaynet.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/doclaynet.yaml new file mode 100755 index 0000000000000000000000000000000000000000..13ea3e5f1ca88611e3e79b965ee9bc549a50050c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/doclaynet.yaml @@ -0,0 +1,45 @@ +name: "doclaynet" +task: "DLA" +base_dataset_name: "doclaynet" +documents_count: 10 +valid_labels: + - LE-CAPTION + - LE-FOOTNOTE + - LE-FORMULA + - LE-LIST-ITEM + - LE-PAGE-FOOTER + - LE-PAGE-HEADER + - LE-PICTURE + - LE-SECTION-HEADER + - LE-TABLE + - LE-TEXT + - LE-TITLE + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + Give each applicable element in HTML a layout class from the list below to uniquely identify its label: + * "LE-CAPTION": Text that accompanies and explains figures, tables, or other visual elements, typically appearing above or below the referenced element. + * "LE-FOOTNOTE": Supplementary notes or citations placed at the bottom of a page, providing additional context or references to the main text, distinct from footers. + * "LE-FORMULA": Mathematical equations, chemical formulas, or symbolic expressions, whether displayed inline or as standalone elements. + * "LE-LIST-ITEM": Individual items within enumerated, bulleted, or definition lists, with each list item annotated separately rather than as a unified list structure. + * "LE-PAGE-FOOTER": Recurring content at the bottom of pages such as page numbers, copyright notices, document identifiers, or footer text. + * "LE-PAGE-HEADER": Recurring content at the top of pages including running headers, document titles, chapter names. + * "LE-PICTURE": Photographs, diagrams, charts, graphs, illustrations, and other visual content excluding tables. + * "LE-SECTION-HEADER": Section and subsection headings. + * "LE-TABLE": Complete table structure including grid content, inline captions, and column/row headers as a unified element. + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, definitions, descriptions, and other primary textual content. + * "LE-TITLE": The main document title appearing prominently at the beginning of the document, distinct from section headers. 
+ gt_format: 'Empty JSON object: {}' + +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/docvqa.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/docvqa.yaml new file mode 100755 index 0000000000000000000000000000000000000000..87ba0d4e8a76c87555378271680dbf0d6b1ce488 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/docvqa.yaml @@ -0,0 +1,19 @@ +name: "docvqa" +task: "QA" +base_dataset_name: "ex_docvqa" +documents_count: 1000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/funsd.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/funsd.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7b08ace6c32232d726b3b2800dbe420ee373c7d8 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/funsd.yaml @@ -0,0 +1,28 @@ +name: "funsd" +task: "QA" +base_dataset_name: "funsd" +documents_count: 300 +valid_labels: +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "form" + language: "English" + gt_type: | + keys and their values structured as QA pairs + * "HEADER": The header of the question answer pair. + * "QUESTION": The question i.e. a key. + * "ANSWER": The answer i.e. a value. + gt_format: | + Up to 8 pairs as a JSON object { + "PAIR_1": {"header": "
", "question": "", "answer": ""}, + "PAIR_2": {"header": "
", "question": "", "answer": ""}, + ... + } + + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/icdar2019.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/icdar2019.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7e8b12404d1fb0551366803399633fe5784e726c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/icdar2019.yaml @@ -0,0 +1,25 @@ +name: "icdar2019" +task: "DLA" +base_dataset_name: "icdar2019" +documents_count: 10 +valid_labels: + - LE-TABLE + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse modern digital-born and historical archival scanned" + language: "English" + gt_type: | + Give each applicable element in HTML a layout class from the list below to uniquely identify its label: + * "LE-TABLE": Any tabular structure containing data organized in rows and columns. Include the complete table region from border to border. + gt_format: 'Empty JSON object: {}' + +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/publaynet.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/publaynet.yaml new file mode 100755 index 0000000000000000000000000000000000000000..6fdccad3871b4bdc9cc0c72c5be35882cb3109cc --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/publaynet.yaml @@ -0,0 +1,30 @@ +name: "publaynet" +task: "DLA" +base_dataset_name: "publaynet" +documents_count: 10 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + Give each applicable element in HTML a layout class from the list below to uniquely identify its label: + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises article titles and standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: 'Empty JSON object: {}' + +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/rvlcdip.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/rvlcdip.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e5756d2d6451afc201a9cb5a74bf4c7b3e03cd87 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/rvlcdip.yaml @@ -0,0 +1,52 @@ +name: "rvlcdip" +task: "CLASSIFICATION" +base_dataset_name: "rvlcdip" +documents_count: 10 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/sroie.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/sroie.yaml new file mode 100755 index 0000000000000000000000000000000000000000..18680f4cd72e6fd69399e291f9b77adc3443ce5c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/sroie.yaml @@ -0,0 +1,32 @@ +name: "sroie" +task: "KIE" +base_dataset_name: "sroie" +documents_count: 1000 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. 
+ gt_format: 'JSON object {"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/tobacco3482.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/tobacco3482.yaml new file mode 100755 index 0000000000000000000000000000000000000000..88fc8ea9390ea663bcbf21e5b4ecd5a66a64f89b --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/tobacco3482.yaml @@ -0,0 +1,44 @@ +name: "tobacco3482" +task: "CLASSIFICATION" +base_dataset_name: "tobacco3482" +documents_count: 1000 +valid_labels: + - ADVERTISEMENT + - EMAIL + - FORM + - LETTER + - MEMO + - NEWS_ARTICLE + - NOTE + - REPORT + - RESUME + - SCIENTIFIC + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "legal and corporate" + language: "English" + gt_type: | + document class labels: + * ADVERTISEMENT: Advertisement + * EMAIL: Email + * FORM: Form + * LETTER: Letter + * MEMO: Memo + * NEWS_ARTICLE: News article + * NOTE: Note/handwritten note + * REPORT: Report + * RESUME: Resume/CV + * SCIENTIFIC: Scientific publication + gt_format: 'JSON object {"label": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: start | 38.61 USD +# ICVPR: end | 50.37 USD +# 936 samples @ 11.76 USD => 1.25 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-handwritten-sizes4.yaml b/data/syn_dataset_definitions/legacy/docvqa-handwritten-sizes4.yaml new file mode 100755 index 0000000000000000000000000000000000000000..896cd126f1e76400132082ab4923e7480408057c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-handwritten-sizes4.yaml @@ -0,0 +1,20 @@ +name: "docvqa-handwritten-sizes4" +documents_count: 10 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa-handwritten-examples" +seed_images_count: 1 +seed_image_max_width: 512 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-pipelinetest.yaml b/data/syn_dataset_definitions/legacy/docvqa-pipelinetest.yaml new file mode 100755 index 0000000000000000000000000000000000000000..b30d37b86b73d7c270de3c1a0ec3d68b00a943f4 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-pipelinetest.yaml @@ -0,0 +1,21 @@ +name: "docvqa-pipelinetest" +base_dataset_name: "ex_docvqa" +documents_count: 100 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +sampling_strategy: "proportional_cluster_size_sampling" \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=-1.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=-1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..8cf71906cc43e233c6c83624a69433bc2f0416d6 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=-1.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=-1" +base_dataset_name: "ex_docvqa" +documents_count: 1 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: -1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.5.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7f5d1ad9d170f9d172ab5ad7213be22453c67f88 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.5.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=0.5" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.75.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..4dd7d39c594f76a82c01b9fd5f19cd05c419815e --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.75.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=0.75" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..830a206982556a4dbf750e0ebd95e84f423e5a1d --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=0" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=1.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..435ac48602e6c965d48be81c6c18eddb2028ae16 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=1.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=1" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test.yaml b/data/syn_dataset_definitions/legacy/docvqa-test.yaml new file mode 100755 index 0000000000000000000000000000000000000000..c241265205f64c3e77f1773e02273d7ff3c49b85 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test.yaml @@ -0,0 +1,21 @@ +name: "docvqa-test" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-viselems.yaml b/data/syn_dataset_definitions/legacy/docvqa-viselems.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7c29e7c4bb4bfcbe802f90bad2e5a0f0d1fc9dbe --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-viselems.yaml @@ -0,0 +1,21 @@ +name: "docvqa-viselems" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +prompt_template: "ClaudeRefined10" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-viselems2.yaml b/data/syn_dataset_definitions/legacy/docvqa-viselems2.yaml new file mode 100755 index 0000000000000000000000000000000000000000..9f8248735bf2f1929786613922dfdf31657d79ba --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-viselems2.yaml @@ -0,0 +1,18 @@ +name: "docvqa-viselems2" +task: "QA" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/sroie-test.yaml b/data/syn_dataset_definitions/legacy/sroie-test.yaml new file mode 100755 index 0000000000000000000000000000000000000000..9538dbc43640bde0d5c5158dd747e7fab6a7bce2 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/sroie-test.yaml @@ -0,0 +1,27 @@ +name: "sroie-test" +task: "KIE" +base_dataset_name: "sroie" +documents_count: 100 + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: 'JSON object {"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/sroie_as_annotation.yaml b/data/syn_dataset_definitions/legacy/sroie_as_annotation.yaml new file mode 100755 index 0000000000000000000000000000000000000000..41cbee70ab5b361bbc639a9bfcf5a7c078c1c817 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/sroie_as_annotation.yaml @@ -0,0 +1,34 @@ +name: "sroie" +task: "KIE" +base_dataset_name: "sroie" +documents_count: 50 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: | + Ensure every label is only present once and to annotate exact using spans, e.g. "ADDRESS" element should not contain other contact info. + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-authors-visual_elements-examples_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-authors-visual_elements-examples_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a0761ea7cda99c190ae82ab93954b00eac58c881 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-authors-visual_elements-examples_seed_based.yaml @@ -0,0 +1,20 @@ +name: "syn_docvqa-handwritten-authors-visual_elements-examples_seed_based" +documents_count: 100 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined2" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa-handwritten-examples" +seed_images_count: 1 +seed_image_max_width: 512 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-examples_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-examples_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..93c5fd84c5013b4e2277d28e46c8696efbfa5d1d --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-examples_seed_based.yaml @@ -0,0 +1,29 @@ +name: "syn-docvqa-handwritten-examples-seed-based" +documents_count: 100 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined1" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "Also include **handwritten textfields**, if the type of document demands it: mark these simply with the HTML class 'handwritten', otherwise apply no specific styles or fonts and treat them as usual text spans. + Analyze the seed images to identify and replicate the primary structural elements, which may include: + * Headers, titles, and document identification + * Main content organization (tables, paragraphs, lists, visual elements) + * Data relationships and hierarchical information + * Labels, captions, and descriptive text + * Numerical data, dates, and reference information + * Visual elements like charts, diagrams, or structured layouts + * Footer information, signatures, or supplementary details + * Any other document-specific organizational patterns observed" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa-handwritten-examples" +seed_images_count: 1 +seed_image_max_width: 500 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_docvqa_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_docvqa_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..cf894d8a287cf033f6a104801e84785090a81587 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_docvqa_seed_based.yaml @@ -0,0 +1,28 @@ +name: "syn-docvqa-seed-based" +documents_count: 15000 # 10.194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined1" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "Analyze the seed images to identify and replicate the primary structural elements, which may include: + * Headers, titles, and document identification + * Main content organization (tables, paragraphs, lists, visual elements) + * Data relationships and hierarchical information + * Labels, captions, and descriptive text + * Numerical data, dates, and reference information + * Visual elements like charts, diagrams, or structured layouts + * Footer information, signatures, or supplementary details + * Any other document-specific organizational patterns observed" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa" +seed_images_count: 10 +seed_image_max_width: 500 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_sroie_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_sroie_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..10cd8bdc1b5e8d45448e7e165089d3e3d5f09cab --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_sroie_seed_based.yaml @@ -0,0 +1,23 @@ +name: "syn-sroie-seed-based" +documents_count: 600 + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined1" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + sections: + - "company" + - "date" + - "address" + - "total" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "keys and their values" + gt_format: '{"company": "company value", "date": "date value", "address": "address value", "total": "total value"}' + +seed_images_folder: "sroie" +seed_images_count: 10 +seed_image_max_width: 500 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.5.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..4fe80d9ab6b792c2aa65c084788007cf63702c54 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.5.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.5" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: 
"English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..c86b6bd8bd993711c1385a8dd4293dc3d80ad83d --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.5_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.5_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.75.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7f363a06a904e9e7d2ecad5b8677c07b886b000d --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.75.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.75" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..760d1911dd20a864b8d1d5a6d6c961f5440552b9 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.75_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.75_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=1.0.yaml b/data/syn_dataset_definitions/publaynet_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..0840bd9ac7b3e771cc622821f4e03a035262c5ee --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=1.0.yaml @@ -0,0 +1,39 @@ +name: "publaynet_alpha=1.0" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +label_mapping: + LE-TEXT: text + LE-TITLE: title + LE-TABLE: table + LE-FIGURE: figure + LE-LIST: list +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/publaynet_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..25e71a4f983d90d6529e6984dbfa00bb4e5ce6c6 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=1.0_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=1.0_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d016b2d8976c16a500fbecd8134112a551ca861a --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.5" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..39002a7736e3235bf95bebdfa4a18d87db2286db --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.5_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..eb43943862eae3251bfd507029df532e43731234 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.75" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7864fc37f1a1fa59d02c47a1f4ab7bbe41c35c0e --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.75_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..95f91c0efdcbc04256798d4faf3a14451d8cfe80 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=1.0" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..14e9cf44dcacec4178f58fad1b2a5cc2ae27caee --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=1.0_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/rvlcdip.yaml b/data/syn_dataset_definitions/rvlcdip.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e0897cbfe4dc63644d48d20d3429076c5909c1a3 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 10 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.5.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..099934af2e96f6783e6c62786b2607c635bce8c6 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.5.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.5" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..5a9c73a4508b804dbd661dcbc02112494d74f3f6 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.5_v1.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.5_v1" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + 
- budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.75.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..306524e3ec99a18c72bd8a23370a2d2e117e50f3 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.75.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.75" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..595c21385ab79a6724b4e54a04eb7b5d5b85e9f9 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.75_v1.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.75_v1" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 
0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=1.0.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..3b32ab6f79eed8a14e2d624c629d99c2bf12cfc3 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=1.0.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=1.0" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d78111012868203f5be22fe285b3aa56d2cf7d56 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=1.0_v1.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=1.0_v1" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/sroie.yaml b/data/syn_dataset_definitions/sroie.yaml new file mode 100755 index 0000000000000000000000000000000000000000..0ad1e348176c843296dffc5931517b4aa2d4c9fd --- /dev/null +++ b/data/syn_dataset_definitions/sroie.yaml @@ -0,0 +1,37 @@ +name: "sroie" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "sroie" +documents_count: 50 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. 
+ * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: '{"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/sroie_alpha=1.0.yaml b/data/syn_dataset_definitions/sroie_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..1122fc39ac9d8f51714077ce5b866ce536b4985e --- /dev/null +++ b/data/syn_dataset_definitions/sroie_alpha=1.0.yaml @@ -0,0 +1,37 @@ +name: "sroie_alpha=1.0" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "sroie" +documents_count: 1000 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: '{"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/sroie_test.yaml b/data/syn_dataset_definitions/sroie_test.yaml new file mode 100755 index 0000000000000000000000000000000000000000..3b50a0e0bc5fd826fa1f4f45205a9773b78eaddf --- /dev/null +++ b/data/syn_dataset_definitions/sroie_test.yaml @@ -0,0 +1,37 @@ +name: "sroie_test" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "sroie" +documents_count: 10 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. 
+ gt_format: '{"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/templates/cord.yaml b/data/syn_dataset_definitions/templates/cord.yaml new file mode 100755 index 0000000000000000000000000000000000000000..fb069bd03f06f336deeb77050c121a49cdb96388 --- /dev/null +++ b/data/syn_dataset_definitions/templates/cord.yaml @@ -0,0 +1,200 @@ +name: "cord" +task: "KIE" +base_dataset_name: "cord" +documents_count: 10 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. 
+ * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/templates/publaynet.yaml b/data/syn_dataset_definitions/templates/publaynet.yaml new file mode 100755 index 0000000000000000000000000000000000000000..885158bef50feba059b2ca84e94d7d338f833ac4 --- /dev/null +++ b/data/syn_dataset_definitions/templates/publaynet.yaml @@ -0,0 +1,33 @@ +name: "publaynet" +task: "DLA" +base_dataset_name: "publaynet" +documents_count: 20 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/templates/rvlcdip.yaml b/data/syn_dataset_definitions/templates/rvlcdip.yaml new file mode 100755 index 0000000000000000000000000000000000000000..45e33017660a53598959979e9f09e489a388774a --- /dev/null +++ b/data/syn_dataset_definitions/templates/rvlcdip.yaml @@ -0,0 +1,54 @@ +name: "rvlcdip" +task: "CLASSIFICATION" +base_dataset_name: "rvlcdip" +documents_count: 10 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/tobacco3482_alpha=1.0.yaml b/data/syn_dataset_definitions/tobacco3482_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..9c89ba154534e79281ee9fb5c7bb90ff7a3958f3 --- /dev/null +++ b/data/syn_dataset_definitions/tobacco3482_alpha=1.0.yaml @@ -0,0 +1,60 @@ +name: "tobacco3482_alpha=1.0" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "tobacco3482" +documents_count: 5500 +valid_labels: + - ADVERTISEMENT + - EMAIL + - FORM + - LETTER + - MEMO + - NEWS_ARTICLE + - NOTE + - REPORT + - RESUME + - SCIENTIFIC +label_mapping: + ADVERTISEMENT: ADVE + EMAIL: Email + FORM: Form + LETTER: Letter + MEMO: Memo + NEWS_ARTICLE: News + NOTE: Note + REPORT: Report + RESUME: Resume + SCIENTIFIC: Scientific + +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "legal and corporate" + language: "English" + gt_type: | + document class labels: + * ADVERTISEMENT: Advertisement + * EMAIL: Email + * FORM: Form + * LETTER: Letter + * MEMO: Memo + * NEWS_ARTICLE: News article + * NOTE: Note/handwritten note + * REPORT: Report + * RESUME: Resume/CV + * SCIENTIFIC: Scientific publication + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: start | 38.61 USD +# ICVPR: end | 50.37 USD +# 936 samples @ 11.76 USD => 1.25 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/wtq_alpha=1.0.yaml b/data/syn_dataset_definitions/wtq_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d03cc747e95624906ca2e0bdb4c8e7f5abf3dba9 --- /dev/null +++ b/data/syn_dataset_definitions/wtq_alpha=1.0.yaml @@ -0,0 +1,30 @@ +name: "wtq_alpha=1.0" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_wiki" +documents_count: 1600 # 1600 (1400 + 200 margin of error) +valid_labels: +label_mapping: +valid_secondary_labels: + 
+prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "semi-structures table" + language: "English" + gt_type: | + Multiple complex question-answer pairs in everyday language that can be answered from the associated table, with their answers taken **verbatim** from the document. + Common Question Types: + * Lookup: Finding specific cell values ("What is the capital of France?") + * Aggregation: Counting, summing, averaging ("How many players scored over 20 points?") + * Comparison: Finding max/min ("Which country has the largest population?") + * Reasoning: Requiring multiple steps ("What team did the highest scorer play for?") + gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 5 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM (1).png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..aa97cb337fdcd413404851c8e4d2cb7f97a32e39 --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8862d479ae51472629b63424e6786a6ee0affd0b46c96dff3cc2489d6fdfa85e +size 1210373 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM.png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM.png new file mode 100755 index 0000000000000000000000000000000000000000..785f4fca8497adc0fc39aed20fc6ded7461fb82c --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf61a520590255d3b96c005b62d52a60b8135fbd3efa6a68a3b8289a865e9217 +size 1435185 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM (1).png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..70d255d6e0cc15d9d7e46880b04e124dae035e47 --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030bd85435de1b77e07a5ce579686fa807e57195914d84c438de469fb2506948 +size 1537276 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM.png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM.png new file mode 100755 index 0000000000000000000000000000000000000000..9ac0c82944c8810570cadd00c423d117bf5e11ff --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f871d775be655d255086af8ea04730c235d4040ce4e7503de98f16205ad8a373 +size 1693736 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_12PM.png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_12PM.png new file mode 100755 index 0000000000000000000000000000000000000000..bc618c88575a5f082debc36aeb90ee8c884b3516 --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_12PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1177692955eed1930228849a22197ffa6e93d7c393a82e40dfe584d16b0bcecf +size 1470095 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_13PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_13PM.png new file mode 100755 index 0000000000000000000000000000000000000000..c372c616b1709eee531374257ed7913b108b2de2 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_13PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de46e8129861a4e753e3a6c9b10b00c8d32c3e69a3b5ff78bc2fbf6b0d86b6f +size 1562926 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM (1).png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..4b3680af82196261833234f0c5d28ec8e4662fb1 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f9cb6b08eff876ba35311dfb09a1ef7994219f09997064138570d59acf7ded +size 1195767 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM.png new file mode 100755 index 0000000000000000000000000000000000000000..95547d755a56866ad5afe728506dc399800be2d1 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6fc569379b9e99d61554940cbea022af613b1cba910d6731d23e83f9929403f +size 1344638 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM (2).png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM (2).png new file mode 100755 index 0000000000000000000000000000000000000000..f4ae0402577db8aaa501c6f0518ef1b32ae50217 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM (2).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71156800daddbf8fd1f7bfc8d5827afc24df0b080750567bcba865569d14fed +size 1439178 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM.png new file mode 100755 index 0000000000000000000000000000000000000000..e0534c57bebd1df59c618b1dcba988d9a4a23388 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8f133f7ec17a2f0eaf58b96c941c0d420b282edc07673ff6f5b03e8dcdd7c7f +size 1261549 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_20PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_20PM.png new file mode 100755 index 0000000000000000000000000000000000000000..8ac33ecef7a424b0e7c93abbe737a68bd144ddf0 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_20PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205f1444edf957d3301ff43ff3fb8bbfc79210c3b84de1322269cbf77dc7fa71 +size 1697350 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_36AM (2).png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_36AM (2).png new file mode 100755 index 
0000000000000000000000000000000000000000..75f45acc15292c3628d3d0f621f01d4b5d28a42f --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_36AM (2).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3467d58ae9d8f2b4bdca834915ac2245096d1b673aaffb1d504246cd9bc67c9 +size 2127546 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM (1).png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..2c36cf358e7a7732a638f6f8cc436cd5985c50d8 --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a91e2d887b5f3bc49ea5c98c86e11bbdb806234b6716e2970a5926562e8e6be5 +size 2180085 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM.png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM.png new file mode 100755 index 0000000000000000000000000000000000000000..49003d4dba70cac7bb02ed72dddc89af277e0842 --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cb3a5912c28948e49a59d1b61886c9ba113371531a6e25de50efec70b075c74 +size 2096141 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_39AM.png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_39AM.png new file mode 100755 index 0000000000000000000000000000000000000000..0b5981fde9eca21d21e95bfef7d8cb2c2e54f9b4 --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_39AM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00e880cf6d316a7f46f8505b8e351ca8d21e360d93ca738f339bbd5f27d979b1 +size 2241792 diff --git a/data/visual_element_prefabs/photo/photo1.jpg b/data/visual_element_prefabs/photo/photo1.jpg new file mode 100755 index 0000000000000000000000000000000000000000..c1aa252ccf87453a9a0f3bbbcd333453aa33dd11 --- /dev/null +++ b/data/visual_element_prefabs/photo/photo1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f21fa89f133ca73b40e9b6051b032b7ca0a69ff69a6f0ec160fda62bbbfacd +size 547596 diff --git a/data/visual_element_prefabs/photo/photo2.jpg b/data/visual_element_prefabs/photo/photo2.jpg new file mode 100755 index 0000000000000000000000000000000000000000..2b71d7099b95ab6dd2873d01c9f4e29ee383a5b9 --- /dev/null +++ b/data/visual_element_prefabs/photo/photo2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c55f123f45626687bea3d54bb12353447c45cce7264a13f9250610fa506209cd +size 590629 diff --git a/data/visual_element_prefabs/photo/photo3.jpg b/data/visual_element_prefabs/photo/photo3.jpg new file mode 100755 index 0000000000000000000000000000000000000000..56cfdfdc993cf167c2c49b2eadc5ddfbbd24c2db --- /dev/null +++ b/data/visual_element_prefabs/photo/photo3.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:632ae0175f03eaaf19315a250aa6fb1af62eb4e331e3185967a3348bbd04394a +size 550545 diff --git a/data/visual_element_prefabs/photo/photo4.jpg b/data/visual_element_prefabs/photo/photo4.jpg new file mode 100755 index 0000000000000000000000000000000000000000..fbd37c4d57816e3e183e4da832ef2362cba5a83c --- /dev/null +++ b/data/visual_element_prefabs/photo/photo4.jpg @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:2c298f261a7b24aadf96b63fc4cf560a1b69c236f884a58d11cf8e3dc8eed64d +size 552578 diff --git a/data/visual_element_prefabs/photo/photo5.jpg b/data/visual_element_prefabs/photo/photo5.jpg new file mode 100755 index 0000000000000000000000000000000000000000..da5fd64d53b6a2be84f4f9f1df4088b2f7c34433 --- /dev/null +++ b/data/visual_element_prefabs/photo/photo5.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7df040dd747bc9b53e195a71a06171199c57a57b85e10c84c6caaf6613c4defe +size 527274 diff --git a/deploy.sh b/deploy.sh new file mode 100755 index 0000000000000000000000000000000000000000..a79c88db58c289ddb028194da1b432c3c63e8472 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# ============================================ +# DocGenie Deployment Helper Script +# ============================================ +# Quick deployment script for Railway + RunPod + +set -e # Exit on error + +echo "🚀 DocGenie Deployment Helper" +echo "==============================" +echo "" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Function to print colored messages +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +print_error() { + echo -e "${RED}✗ $1${NC}" +} + +print_info() { + echo -e "${YELLOW}ℹ $1${NC}" +} + +# Check prerequisites +echo "Checking prerequisites..." + +# Check if Docker is installed +if ! command -v docker &> /dev/null; then + print_error "Docker is not installed. Please install Docker first." + exit 1 +fi +print_success "Docker installed" + +# Check if .env exists +if [ ! -f "api/.env" ]; then + print_error "api/.env file not found. Please create it first." + exit 1 +fi +print_success "Environment file found" + +# Menu +echo "" +echo "Select deployment option:" +echo "1) Build Handwriting Service Docker image" +echo "2) Push Handwriting Service to Docker Hub" +echo "3) Deploy API to Railway" +echo "4) Run local test environment (docker-compose)" +echo "5) Full deployment (Handwriting + API)" +echo "0) Exit" +echo "" +read -p "Enter option (0-5): " option + +case $option in + 1) + echo "" + print_info "Building Handwriting Service Docker image..." + + # Build image + cd handwriting_service + docker buildx build --platform linux/amd64 \ + -t docgenie-handwriting:latest \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + . + + print_success "Image built successfully" + print_info "Tag: docgenie-handwriting:latest" + ;; + + 2) + echo "" + read -p "Enter your Docker Hub username: " docker_username + + print_info "Tagging image for Docker Hub..." + docker tag docgenie-handwriting:latest ${docker_username}/docgenie-handwriting:latest + + print_info "Pushing to Docker Hub..." + docker push ${docker_username}/docgenie-handwriting:latest + + print_success "Image pushed successfully" + print_info "Deploy this on RunPod: ${docker_username}/docgenie-handwriting:latest" + ;; + + 3) + echo "" + print_info "Deploying API to Railway..." + + # Check if Railway CLI is installed + if ! command -v railway &> /dev/null; then + print_error "Railway CLI not installed. Installing..." + npm i -g @railway/cli + fi + + # Deploy + railway up + + print_success "API deployed to Railway" + print_info "View logs: railway logs" + print_info "View URL: railway open" + ;; + + 4) + echo "" + print_info "Starting local test environment..." 
+ print_info "This will start: Redis, API, Worker, Handwriting Service" + + # Check if GPU is available + if command -v nvidia-smi &> /dev/null; then + print_info "GPU detected, using CUDA" + docker-compose up + else + print_info "No GPU detected, using CPU for handwriting service" + DEVICE=cpu docker-compose up + fi + ;; + + 5) + echo "" + print_info "Full deployment starting..." + + # Step 1: Build handwriting image + print_info "Step 1/4: Building Handwriting Service..." + cd handwriting_service + docker buildx build --platform linux/amd64 \ + -t docgenie-handwriting:latest \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + . + cd .. + print_success "Handwriting image built" + + # Step 2: Push to Docker Hub + echo "" + read -p "Enter your Docker Hub username: " docker_username + print_info "Step 2/4: Pushing to Docker Hub..." + docker tag docgenie-handwriting:latest ${docker_username}/docgenie-handwriting:latest + docker push ${docker_username}/docgenie-handwriting:latest + print_success "Image pushed" + + # Step 3: Deploy to RunPod (manual) + echo "" + print_info "Step 3/4: Deploy to RunPod (manual step)" + print_info "1. Go to https://runpod.io → Serverless → New Endpoint" + print_info "2. Use image: ${docker_username}/docgenie-handwriting:latest" + print_info "3. Select GPU: RTX 4090 or A40" + print_info "4. Set port: 8080" + print_info "5. Set env: DEVICE=cuda" + read -p "Press Enter when RunPod deployment is complete..." + + # Step 4: Get RunPod URL and deploy API + echo "" + read -p "Enter your RunPod endpoint URL: " runpod_url + + print_info "Step 4/4: Deploying API to Railway..." + + # Set HANDWRITING_SERVICE_URL + export HANDWRITING_SERVICE_URL=$runpod_url + + # Deploy to Railway + if ! command -v railway &> /dev/null; then + print_error "Railway CLI not installed. Installing..." + npm i -g @railway/cli + fi + + railway up + + print_success "Full deployment complete!" + echo "" + print_info "Next steps:" + print_info "1. Set HANDWRITING_SERVICE_URL in Railway dashboard" + print_info "2. railway variables set HANDWRITING_SERVICE_URL=$runpod_url" + print_info "3. Test: curl https://your-domain.up.railway.app/health" + ;; + + 0) + echo "Goodbye!" + exit 0 + ;; + + *) + print_error "Invalid option" + exit 1 + ;; +esac + +echo "" +print_success "Done!" 
diff --git a/docgenie/__init__.py b/docgenie/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..56bc87e9e06dec489b2e81b897c8125ca713ec3a --- /dev/null +++ b/docgenie/__init__.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from enum import Enum +from pathlib import Path + +_root_path = Path(__file__).parent.parent.resolve() + + +# Project paths +class ENV: + # General + ROOT_DIR: Path = _root_path + DATA_DIR: Path = ROOT_DIR / "data" + + DATASETS_DIR: Path = ROOT_DIR / "data" / "datasets" + BASE_DATASETS_DIR: Path = DATASETS_DIR / "base_v2" + SYN_DATASETS_PREPARED_DIR: Path = DATASETS_DIR / "synthesized_prepared" + SYN_DATASETS_DIR: Path = DATASETS_DIR / "synthesized_datasets" + + VISUAL_ELEMENT_PREFABS_DIR: Path = DATA_DIR / "visual_element_prefabs" + + EMBEDDINGS_DIR: Path = DATA_DIR / "embeddings" + GT_EMBEDDINGS_DIR: Path = DATA_DIR / "gt_embeddings" + CLUSTERS_DIR: Path = DATA_DIR / "clusters" + CLUSTER_PLOTS: Path = DATA_DIR / "cluster_plots" + SYN_DATASET_STAT_PLOTS: Path = DATA_DIR / "syn_dataset_statistics_plots" + + ANALYZATION_DIR: Path = DATA_DIR / "analyzation" + GT_ANALYZATION_DIR: Path = ANALYZATION_DIR / "gt" + KIE_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "kie" + CLS_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "cls" + QA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "qa" + DLA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "dla" + + WEBAPP_CACHE_DIR: Path = DATA_DIR / "webapp_cache" + QA_GT_WEBAPP_CACHE_DIR: Path = WEBAPP_CACHE_DIR / "qa_gt" + + TEMP_DIR: Path = DATA_DIR / "temp" + + MODELS_DIR: Path = DATA_DIR / "models" + RUNS_DIR: Path = DATA_DIR / "runs" + + EXPORTS_DIR: Path = DATA_DIR / "exports" + + # Contains combined datasets (original and synthetic) + PREPARED_DATASETS_DIR: Path = DATASETS_DIR / "prepared" + + SYN_DATA_DEFINITIONS_DIR: Path = DATA_DIR / "syn_dataset_definitions" + PROMPT_TEMPLATES_DIR: Path = DATA_DIR / "prompt_templates" + SEED_IMAGES_DIR: Path = DATA_DIR / "seed-images" + + +ENV.BASE_DATASETS_DIR.mkdir(parents=True, exist_ok=True) +ENV.SYN_DATASETS_DIR.mkdir(parents=True, exist_ok=True) +ENV.SYN_DATASETS_PREPARED_DIR.mkdir(parents=True, exist_ok=True) +ENV.VISUAL_ELEMENT_PREFABS_DIR.mkdir(parents=True, exist_ok=True) +ENV.PREPARED_DATASETS_DIR.mkdir(parents=True, exist_ok=True) +ENV.EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True) +ENV.CLUSTERS_DIR.mkdir(parents=True, exist_ok=True) +ENV.TEMP_DIR.mkdir(parents=True, exist_ok=True) +ENV.MODELS_DIR.mkdir(parents=True, exist_ok=True) +ENV.EXPORTS_DIR.mkdir(parents=True, exist_ok=True) +ENV.CLUSTER_PLOTS.mkdir(parents=True, exist_ok=True) +ENV.SYN_DATASET_STAT_PLOTS.mkdir(parents=True, exist_ok=True) +ENV.GT_EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True) +ENV.KIE_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.CLS_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.DLA_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.QA_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.QA_GT_WEBAPP_CACHE_DIR.mkdir(parents=True, exist_ok=True) + + +class LLM: + CLAUDE_SONNET_4 = "claude-sonnet-4-20250514" + CLAUDE_SONNET_4_5 = "claude-sonnet-4-5-20250929" + CLAUDE_HAIKU_4_5 = "claude-haiku-4-5-20251001" + TINYLLM_CLAUDE_SONNET_4 = "anthropic/claude-sonnet-4-20250514" + + +# Default values for generation +class GENERATION: + LLM = LLM.CLAUDE_SONNET_4_5 + MAX_TOKENS = 16384 + HANDWRITING_MODEL_CHECKPOINT = ENV.MODELS_DIR / "handwriting" / "latest.pt" diff --git a/docgenie/analyzation/clustering/cmds/generate_clusters.py
b/docgenie/analyzation/clustering/cmds/generate_clusters.py new file mode 100755 index 0000000000000000000000000000000000000000..a4fdbe9de2a22450293148638779e9c958f0127a --- /dev/null +++ b/docgenie/analyzation/clustering/cmds/generate_clusters.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +from pathlib import Path + +import pydantic.v1 as pydantic +import pydantic_argparse + +from docgenie import ENV +from docgenie.analyzation.clustering.core._metrics import calculate_cluster_statistics +from docgenie.analyzation.clustering.core._utilities import ( + EmbeddingType, + _save_clustering_metrics, +) +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +def main(cfg: ClusteringConfig): + """ + Generate clusters for all embedding types and save results. + """ + + import numpy as np + + from docgenie.analyzation.clustering.core._algorithms import ( + _read_and_cluster_embeddings, + ) + from docgenie.analyzation.clustering.core._metrics import ( + evaluate_clusters_unsupervised, + ) + from docgenie.analyzation.clustering.core._utilities import ( + _get_clustering_output_path, + ) + + logger.info(f"Clustering with config:\n{cfg}") + + for embedding_type in EmbeddingType.__members__.values(): + logger.info(f"Generating clusters for {embedding_type.value=}") + + # see if embeddings exist + embeddings_path = ( + Path(cfg.embeddings_dir) / cfg.dataset_name / (f"{embedding_type.value}.h5") + ) + if not embeddings_path.exists(): + logger.warning( + f"Embeddings not found for {cfg.dataset_name} at {embeddings_path}, skipping..." + ) + continue + + # save cluster labels + output_dir = Path(cfg.output_dir) / cfg.dataset_name / embedding_type.value + clusters_path = _get_clustering_output_path( + output_dir=output_dir, + intermediate_num_dims=cfg.intermediate_num_dims, + hdbscan_min_cluster_size=cfg.hdbscan_min_cluster_size, + hdbscan_metric=cfg.hdbscan_metric, + k_nn_n_neighbors=cfg.k_nn_n_neighbors, + do_knn=cfg.do_knn, + method=cfg.method, + ) + + if not Path(clusters_path).exists(): + outputs = _read_and_cluster_embeddings( + embeddings_dir=cfg.embeddings_dir, + dataset_name=cfg.dataset_name, + embedding_type=embedding_type, + intermediate_num_dims=cfg.intermediate_num_dims, + hdbscan_min_cluster_size=cfg.hdbscan_min_cluster_size, + hdbscan_metric=cfg.hdbscan_metric, + k_nn_n_neighbors=cfg.k_nn_n_neighbors, + seed=cfg.seed, + do_knn=cfg.do_knn, + cache_dir=output_dir, + method=cfg.method, + ) + + logger.info(f"Saving clusters to {clusters_path}...") + Path(clusters_path).parent.mkdir(parents=True, exist_ok=True) + np.save( + clusters_path, + outputs, + ) + cluster_labels = outputs["cluster_labels"] + num_noise = outputs.get("num_noise", 0) + embeddings_reduced_dim = outputs["embeddings_reduced_dim"] + else: + logger.info(f"Loading existing clusters from {clusters_path}...") + cluster_results = np.load(clusters_path, allow_pickle=True).item() + cluster_labels = cluster_results["cluster_labels"] + num_noise = cluster_results["num_noise"] + embeddings_reduced_dim = cluster_results["embeddings_reduced_dim"] + + # compute cluster statistics + cluster_stats = calculate_cluster_statistics( + embeddings_reduced_dim, cluster_labels + ) + cluster_stats.to_csv( + clusters_path.parent / clusters_path.name.replace(".npy", "_stats.csv"), + index=False, + ) + + # compute metrics + cluster_metrics, num_clusters = evaluate_clusters_unsupervised( + embeddings=embeddings_reduced_dim, cluster_labels=cluster_labels + ) + + # save metrics + _save_clustering_metrics( + 
output_dir=cfg.output_dir, + dataset_name=cfg.dataset_name, + hdbscan_min_cluster_size=cfg.hdbscan_min_cluster_size, + intermediate_num_dims=cfg.intermediate_num_dims, + hdbscan_metric=cfg.hdbscan_metric, + k_nn_n_neighbors=cfg.k_nn_n_neighbors, + method=cfg.method, + embedding_type=embedding_type, + embeddings=embeddings_reduced_dim, + cluster_metrics=cluster_metrics, + num_clusters=num_clusters, + num_noise=num_noise, + seed=cfg.seed, + do_knn=cfg.do_knn, + ) + + +class ClusteringConfig(pydantic.BaseModel): + """ + Configuration for clustering operations. + """ + + dataset_name: str + seed: int = 42 + hdbscan_min_cluster_size: int = 10 + intermediate_num_dims: int = 100 + hdbscan_metric: str = "euclidean" + do_knn: bool = True + k_nn_n_neighbors: int = 5 + embeddings_dir: str | Path = ENV.EMBEDDINGS_DIR + output_dir: str | Path = ENV.CLUSTERS_DIR + method: str = "hdbscan" # or "kmeans" + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=ClusteringConfig, + ) + main(parser.parse_typed_args()) diff --git a/docgenie/analyzation/clustering/cmds/generate_embeddings.py b/docgenie/analyzation/clustering/cmds/generate_embeddings.py new file mode 100755 index 0000000000000000000000000000000000000000..ee9a8ec2000831a99453c5e3d602775e9cf9917c --- /dev/null +++ b/docgenie/analyzation/clustering/cmds/generate_embeddings.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +from pathlib import Path + +import pydantic.v1 as pydantic +import pydantic_argparse + +from docgenie import ENV +from docgenie.analyzation.clustering.core._embeddings import ( + _load_sample_ids_from_embeddings, + _save_embeddings, + embedding_extraction_with_cache, +) +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data._core._utilities import TaskType +from docgenie.data.interface import load_data_pipeline, load_preprocessed_data_pipeline +from docgenie.evaluation.utils import get_device +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +class GenerateEmbeddingsConfig(pydantic.BaseModel): + """ + Configuration for generating embeddings. + """ + + dataset_name: str + is_synth: bool = False + output_dir: str = ENV.EMBEDDINGS_DIR + kernel_size: int = 4 + split: str = "train" + batch_size: int = 16 + dataloader_num_workers: int = 8 + use_preprocessed: bool = False + verify_only: bool = False + is_synthetic: bool = False + + +def main(cfg: GenerateEmbeddingsConfig): + # setup data pipeline and dataloaders with preprocessing + # this will save preprocessed msgpacks + if cfg.use_preprocessed: + data_pipeline = load_preprocessed_data_pipeline( + dataset_name=cfg.dataset_name, + is_synthetic=cfg.is_synth, + task_type=TaskType.generate_embeddings, + split=cfg.split, + ) + else: + data_pipeline = load_data_pipeline( + dataset_name=cfg.dataset_name, + is_synthetic=cfg.is_synth, + task_type=TaskType.generate_embeddings, + split=cfg.split, + ) + + if cfg.verify_only: + output_dir = Path(cfg.output_dir) / cfg.dataset_name + sample_ids_per_type = {} + for embedding_type in list(EmbeddingType): + cache_file = Path(output_dir) / f"{embedding_type.value}.h5" + if not cache_file.exists(): + logger.warning( + f"Cache file {cache_file} does not exist. Please run the script " + "without --verify_only to generate embeddings." + ) + continue + sample_ids = _load_sample_ids_from_embeddings(cache_file) + logger.info( + f"Cache file {cache_file} exists with {len(sample_ids)} samples."
+ ) + sample_ids_per_type[embedding_type.value] = sample_ids + + # make sure sample ids are the same across all types + sample_ids = sample_ids_per_type[ + sample_ids_per_type.keys().__iter__().__next__() + ] + for embedding_type, ids in sample_ids_per_type.items(): + assert ids == sample_ids, ( + f"Sample IDs for {embedding_type} do not match those for " + f"{EmbeddingType.layout.value}" + ) + + logger.info(f"All cache files exist for dataset {cfg.dataset_name}.") + return + + # print dataset info + logger.info(data_pipeline.dataset) + + # setup dataloader + dataloader = data_pipeline.dataloader( + split=cfg.split, + batch_size=cfg.batch_size, + shuffle=False, + num_workers=cfg.dataloader_num_workers, + ) + + # check whether batch in the dataset has ocr content + batch = next(iter(dataloader)) + has_ocr_content = batch.words is not None + + output_dir = Path(cfg.output_dir) / cfg.dataset_name + embeddings_per_type = {} + sample_ids_per_type = {} + for embedding_type in list(EmbeddingType): + if embedding_type == EmbeddingType.combined: + continue + if ( + embedding_type + in [EmbeddingType.layout, EmbeddingType.text, EmbeddingType.paper] + and not has_ocr_content + ): + logger.warning( + f"Skipping {embedding_type.value} embeddings for dataset {cfg.dataset_name} " + "as it does not have OCR content." + ) + continue + embeddings, sample_ids = embedding_extraction_with_cache( + dataloader=dataloader, + output_dir=output_dir, + embedding_type=embedding_type, + device=get_device(), + ) + embeddings_per_type[embedding_type.value] = embeddings + sample_ids_per_type[embedding_type.value] = sample_ids + logger.info( + f"Generated {embedding_type.value} embeddings for {len(sample_ids)} samples." + ) + + # make sure sample ids are the same across all types + sample_ids = sample_ids_per_type[sample_ids_per_type.keys().__iter__().__next__()] + print("Sample ids of first 10 samples: ", sample_ids[:10]) + for embedding_type, ids in sample_ids_per_type.items(): + assert ids == sample_ids, ( + f"Sample IDs for {embedding_type} do not match those for " + f"{EmbeddingType.layout.value}" + ) + + if not has_ocr_content: + logger.warning( + f"Skipping {EmbeddingType.combined.value} embeddings for dataset {cfg.dataset_name} " + "as it does not have OCR content." + ) + return + cache_file = Path(output_dir) / f"{EmbeddingType.combined.value}.h5" + if not cache_file.exists(): + import numpy as np + from sklearn.preprocessing import StandardScaler + + embeddings_per_type = { + k: StandardScaler().fit_transform(v) for k, v in embeddings_per_type.items() + } + + combined_embeddings = np.hstack( + [ + v + for k, v in embeddings_per_type.items() + if k + in [ + EmbeddingType.layout.value, + EmbeddingType.text.value, + EmbeddingType.image.value, + ] + ] + ) + + logger.info( + f"Generated {EmbeddingType.combined.value} embeddings for {len(sample_ids)} samples." 
+ ) + _save_embeddings( + embeddings=combined_embeddings, + sample_ids=sample_ids, + file_path=Path(output_dir) / f"{EmbeddingType.combined.value}.h5", + ) + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=GenerateEmbeddingsConfig, + ) + main(parser.parse_typed_args()) diff --git a/docgenie/analyzation/clustering/cmds/generate_seeds.py b/docgenie/analyzation/clustering/cmds/generate_seeds.py new file mode 100755 index 0000000000000000000000000000000000000000..fb4920c6934e2dcd5e1eb36e4595b2d69a11df16 --- /dev/null +++ b/docgenie/analyzation/clustering/cmds/generate_seeds.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import numpy as np +import pydantic.v1 as pydantic +import pydantic_argparse +import tqdm + +from docgenie import ENV +from docgenie.analyzation.clustering.core._embeddings import ( + _load_sample_ids_from_embeddings, +) +from docgenie.analyzation.clustering.core._utilities import ( + EmbeddingType, + _get_clustering_output_path, + _visualize_images_grid, +) +from docgenie.logging import get_logger + +if TYPE_CHECKING: + import numpy as np + + +logger = get_logger(__name__) + + +def alpha_cluster_sampling_create_pool( + cluster_labels: np.ndarray, + max_seed_pool: int = -1, +) -> np.ndarray: + """ + Create a pool of candidate seed images for LLM prompt construction. + + The pool is sampled **proportional to cluster sizes**, ensuring that + each cluster is represented at least once if possible. This prevents + small clusters from being entirely excluded from the pool. + + Args: + cluster_labels: np.ndarray of cluster labels for all samples. + max_seed_pool: int, maximum number of seed images to select for the pool. + - If -1 or larger than the dataset, the full dataset is used. + + Returns: + np.ndarray: indices of samples included in the pool. 
+ """ + n_samples = len(cluster_labels) + unique_labels = np.unique(cluster_labels) + + # Use full dataset if max_seed_pool is -1 or larger than dataset + if max_seed_pool == -1 or max_seed_pool >= n_samples: + return np.arange(n_samples) + + # Step 1: guarantee one sample per cluster + guaranteed_indices = [ + np.random.choice(np.where(cluster_labels == label)[0]) + for label in unique_labels + ] + + remaining = max_seed_pool - len(guaranteed_indices) + if remaining <= 0: + # pool is smaller than number of clusters: return guaranteed samples + return np.array(guaranteed_indices) + + # Step 2: sample remaining indices proportional to cluster sizes + cluster_sizes = { + label: np.sum(cluster_labels == label).item() for label in unique_labels + } + cluster_prob = { + label: size / sum(cluster_sizes.values()) + for label, size in cluster_sizes.items() + } + doc_prob = np.array([cluster_prob[cluster_labels[i]] for i in range(n_samples)]) + + # Exclude guaranteed indices + available_indices = np.setdiff1d(np.arange(n_samples), guaranteed_indices) + available_prob = doc_prob[available_indices] + available_prob = available_prob / available_prob.sum() + + sampled_remaining = np.random.choice( + available_indices, size=remaining, replace=False, p=available_prob + ) + + pool_indices = np.concatenate([guaranteed_indices, sampled_remaining]) + return pool_indices + + +def alpha_cluster_sampling_pool( + cluster_labels: np.ndarray, + total_seeds: int, + pool_indices: np.ndarray, + alpha: float = 1.0, + seed_selection_strategy: str = "v1", +) -> tuple[list[int], list[int]]: + """ + Sample seeds from a pool using two-stage alpha-based cluster probabilities: + 1) Pick a cluster based on alpha weighting + 2) Pick a random sample from that cluster + + Args: + cluster_labels: np.ndarray of cluster labels for all samples + total_seeds: number of seeds to sample + pool_indices: available sample indices + alpha: exponent for cluster weighting + - alpha=1 -> proportional to cluster size + - alpha=0 -> uniform across clusters + - alpha<0 -> inverse-proportional to cluster size + + Returns: + Tuple of (sampled_indices, sampled_clusters) + """ + pool_labels = cluster_labels[pool_indices] + unique_labels = np.unique(pool_labels) + + # Compute cluster sizes in pool + cluster_sizes = { + label: np.sum(pool_labels == label).item() for label in unique_labels + } + + # Compute alpha-weighted cluster probabilities + cluster_probs = np.array([cluster_sizes[label] ** alpha for label in unique_labels]) + cluster_probs = cluster_probs / cluster_probs.sum() + + if seed_selection_strategy == "v1": + sampled_indices = [] + sampled_clusters = [] + + for _ in range(total_seeds): + # 1) Pick a cluster according to alpha probabilities + cluster = np.random.choice(unique_labels, p=cluster_probs) + + # 2) Pick a random sample from that cluster in the pool + cluster_pool_indices = pool_indices[pool_labels == cluster] + sample = np.random.choice(cluster_pool_indices) + + sampled_indices.append(int(sample)) + sampled_clusters.append(int(cluster)) + + return sampled_indices, sampled_clusters + + elif seed_selection_strategy == "v2": + sampled_indices = [] + sampled_clusters = [] + + cluster = np.random.choice(unique_labels, p=cluster_probs) + for _ in range(total_seeds): + cluster_pool_indices = pool_indices[pool_labels == cluster] + sample = np.random.choice(cluster_pool_indices) + + sampled_indices.append(int(sample)) + sampled_clusters.append(int(cluster)) + + return sampled_indices, sampled_clusters + else: + raise ValueError(f"Unknown 
seed selection strategy: {seed_selection_strategy}") + + +def generate_seeds_for_embedding_type( + cfg: GenerateSeedsConfig, embedding_type: EmbeddingType +) -> tuple[Path, Path]: + import random + + import numpy as np + import pandas as pd + + # set seed + np.random.seed(cfg.seed) + random.seed(cfg.seed) + + # get paths + output_dir = Path(cfg.clusters_dir) / cfg.dataset_name / embedding_type.value + embeddings_path = ( + Path(cfg.embeddings_dir) / cfg.dataset_name / f"{embedding_type.value}.h5" + ) + cluster_sample_ids = _load_sample_ids_from_embeddings(embeddings_path) + clusters_path = _get_clustering_output_path( + output_dir=output_dir, + intermediate_num_dims=cfg.intermediate_num_dims, + hdbscan_min_cluster_size=cfg.hdbscan_min_cluster_size, + hdbscan_metric=cfg.hdbscan_metric, + k_nn_n_neighbors=cfg.k_nn_n_neighbors, + do_knn=cfg.do_knn, + method=cfg.method, + ) + + # load data + # cluster_labels corresponds to the document indices in the dataset + cluster_results = np.load(clusters_path, allow_pickle=True).item() + cluster_labels = cluster_results["cluster_labels"] + logger.info(f"Cluster labels shape: {cluster_labels.shape}") + assert len(cluster_labels) == len(cluster_sample_ids), ( + "Mismatch in number of samples" + ) + + pool_indices = alpha_cluster_sampling_create_pool( + cluster_labels=cluster_labels, max_seed_pool=cfg.max_pool_size + ) + seed_samples = [] + seed_clusters = [] + for _ in tqdm.tqdm(range(cfg.total_seed_runs), desc="Sampling seeds"): + sampled_seeds, sampled_clusters = alpha_cluster_sampling_pool( + cluster_labels=cluster_labels, + total_seeds=cfg.total_seeds_per_run, + pool_indices=pool_indices, + alpha=cfg.alpha, + seed_selection_strategy=cfg.seed_selection_strategy, + ) + sampled_seed_ids = [cluster_sample_ids[i] for i in sampled_seeds] + seed_samples.append(sampled_seed_ids) + seed_clusters.append(sampled_clusters) + + # save the sampled seeds + seeds_output_path = Path(cfg.output_dir) / clusters_path.name.replace( + ".npy", + f"_alpha={cfg.alpha}_max-pool-size={cfg.max_pool_size}_strategy={cfg.seed_selection_strategy}_seeds.csv", + ) + dataframe = pd.DataFrame(seed_samples) + logger.info(f"Saving sampled seeds to {seeds_output_path}...") + dataframe.to_csv(seeds_output_path, index=False) + + # save the sampled clusters + clusters_output_path = Path(cfg.output_dir) / clusters_path.name.replace( + ".npy", + f"_alpha={cfg.alpha}_max-pool-size={cfg.max_pool_size}_strategy={cfg.seed_selection_strategy}_clusters.csv", + ) + dataframe = pd.DataFrame(seed_clusters) + logger.info(f"Saving sampled clusters to {clusters_output_path}...") + dataframe.to_csv(clusters_output_path, index=False) + + # also visualize the random 20 seed documents as an image grid + # load all seed documents into an image grid + if cfg.visualize_seeds: + from docgenie.data import load_dataset + + dataset = load_dataset(cfg.dataset_name, split="train") + seed_images = [] + for seed in sampled_seeds[: cfg.n_seeds_to_visualize]: + seed_images.append( + dataset.train.get_by_id(cluster_sample_ids[seed]).image.content + ) + vis_fname = seeds_output_path.parent / seeds_output_path.name.replace( + ".csv", ".png" + ) + _visualize_images_grid( + images=seed_images, + save_path=vis_fname, + ) + + return seeds_output_path, clusters_output_path + + +class GenerateSeedsConfig(pydantic.BaseModel): + """ + Configuration for generating clustering seeds.
+ """ + + # same as clustering config + dataset_name: str + seed: int = 42 + hdbscan_min_cluster_size: int = 10 + intermediate_num_dims: int = 100 + hdbscan_metric: str = "euclidean" + do_knn: bool = True + k_nn_n_neighbors: int = 5 + embeddings_dir: str | Path = ENV.EMBEDDINGS_DIR + clusters_dir: str | Path = ENV.CLUSTERS_DIR + output_dir: str | Path + method: str = "hdbscan" # or "kmeans" + seed_selection_strategy: str = "v1" + + # specific to seed generation + total_seed_runs: int = 10000 + total_seeds_per_run: int = 10 + visualize_seeds: bool = False + n_seeds_to_visualize: int = 20 + + # sampling strategy + max_pool_size: int = -1 # if -1, seeds are selected from complete dataset, otherwise a pool is generated via proportional sampling, where it is ensured that each cluster is selected at least once + """ + sampling exponent for clusters. + - alpha=1 -> proportional + - alpha=0 -> uniform + - alpha<0 -> inverse-proportional + """ + alpha: float = 0 + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=GenerateSeedsConfig, + ) + generate_seeds_for_embedding_type(parser.parse_typed_args(), EmbeddingType.combined) diff --git a/docgenie/analyzation/clustering/cmds/load_seed_samples.py b/docgenie/analyzation/clustering/cmds/load_seed_samples.py new file mode 100755 index 0000000000000000000000000000000000000000..98e942c1f6609c6d9cd5511873295d1e991778b2 --- /dev/null +++ b/docgenie/analyzation/clustering/cmds/load_seed_samples.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from pathlib import Path +import pydantic.v1 as pydantic +import pydantic_argparse + +from docgenie import ENV +from docgenie.analyzation.clustering.core._embeddings import ( + _load_sample_ids_from_embeddings, +) +from docgenie.analyzation.clustering.core._utilities import ( + EmbeddingType, + _get_clustering_output_path, +) +from docgenie.logging import get_logger + + +logger = get_logger(__name__) + + +def main(cfg: LoadSeedSamples): + import pandas as pd + from docgenie.data import load_dataset + + for embedding_type in EmbeddingType.__members__.values(): + output_dir = Path(cfg.output_dir) / cfg.dataset_name / embedding_type.value + embeddings_path = ( + Path(cfg.embeddings_dir) / cfg.dataset_name / f"{embedding_type.value}.h5" + ) + sample_ids = _load_sample_ids_from_embeddings(embeddings_path) + clusters_path = _get_clustering_output_path( + output_dir=output_dir, + intermediate_num_dims=cfg.intermediate_num_dims, + hdbscan_min_cluster_size=cfg.hdbscan_min_cluster_size, + hdbscan_metric=cfg.hdbscan_metric, + k_nn_n_neighbors=cfg.k_nn_n_neighbors, + do_knn=cfg.do_knn, + method=cfg.method, + ) + seeds_output_path = clusters_path.parent / clusters_path.name.replace( + ".npy", f"_strategy={cfg.sampling_strategy}_seeds.csv" + ) + + # load the sampled seeds + dataset = load_dataset(cfg.dataset_name, split="train") + seed_sample_indices = pd.read_csv(seeds_output_path) + for _, row in seed_sample_indices.iterrows(): + # get seed samples from first row + sampled_seeds = row.tolist() + seed_sample_ids = [sample_ids[int(i)] for i in sampled_seeds] + samples = [dataset.train.get_by_id(sid) for sid in seed_sample_ids] + print(f"Loaded {len(samples)} seed samples from {seeds_output_path}") + print("Example sample: ", samples[0]) + break + + +class LoadSeedSamples(pydantic.BaseModel): + # same as clustering config + dataset_name: str + seed: int = 42 + hdbscan_min_cluster_size: int = 10 + intermediate_num_dims: int = 100 + hdbscan_metric: str = "euclidean" + do_knn: bool = True + 
k_nn_n_neighbors: int = 5 + embeddings_dir: str | Path = ENV.EMBEDDINGS_DIR + output_dir: str | Path = ENV.CLUSTERS_DIR + method: str = "hdbscan" # or "kmeans" + sampling_strategy: str = "uniform_cluster_sampling" + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=LoadSeedSamples, + ) + main(parser.parse_typed_args()) diff --git a/docgenie/analyzation/clustering/compute_best_clusterings.py b/docgenie/analyzation/clustering/compute_best_clusterings.py new file mode 100755 index 0000000000000000000000000000000000000000..4941fa639fd50a5f495e8c4558b135050041fdd3 --- /dev/null +++ b/docgenie/analyzation/clustering/compute_best_clusterings.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +""" +Compute top N clustering configurations per dataset +from a single global metrics file. + +Example: + python compute_best_clusterings_all_in_one.py \ + --metrics compactness__silhouette_score balance__entropy \ + --directions max max \ + --top 5 +""" + +import argparse +import pandas as pd +import numpy as np +from sklearn.preprocessing import MinMaxScaler +from pathlib import Path + +from docgenie import ENV + + +# -------------------------------------------------------------------- +# CONFIG +# -------------------------------------------------------------------- +METRICS_FILE = ENV.CLUSTERS_DIR / "metrics-seed=42.csv" + + +# -------------------------------------------------------------------- +# FUNCTIONS +# -------------------------------------------------------------------- +valid_datasets = [ + "cord", + "doclaynet_4k", + "ex_docvqa", + "ex_klc", + "ex_wiki", + "funsd", + "icdar2019", + "publaynet", + "rvlcdip", + "sroie", + "tobacco3482", +] + + +def compute_best_per_dataset(df, metrics, directions, top_n=5, filter_datasets=False): + """Compute top N configs per dataset for selected metrics.""" + results = [] + + for dataset, group in df.groupby("dataset_name"): + if filter_datasets and dataset not in valid_datasets: + continue + + df_norm = group.copy() + scaler = MinMaxScaler() + + # normalize + direction handling + for metric, direction in zip(metrics, directions): + if metric not in group.columns: + raise ValueError( + f"Metric '{metric}' not found in columns: {list(group.columns)}" + ) + + # normed = scaler.fit_transform(group[[metric]].values) + normed = group[[metric]].values + if direction == "min": + normed = 1 - normed # flip so higher is better + df_norm[metric] = normed + + df_norm["final_score"] = df_norm[metrics].mean(axis=1) + top = df_norm.sort_values("final_score", ascending=False).head(top_n) + top["dataset_name"] = dataset + results.append(top) + + combined = pd.concat(results, ignore_index=True) + return combined + + +# Compute final embedding ranking +def compute_embedding_ranking(top_df, top_n, filter_datasets): + """Aggregate top N positions across datasets per embedding type.""" + ranking_list = [] + + for dataset, group in top_df.groupby("dataset_name"): + if filter_datasets and dataset not in valid_datasets: + continue + + # Sort by final_score descending + group_sorted = group.sort_values("final_score", ascending=False).reset_index() + # Assign position-based score + group_sorted["rank_score"] = ( + top_n - group_sorted.index + ) # top row = top_n, next = top_n-1 ... 
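+        # e.g. with top_n=5 the best row of a dataset contributes 5, the
+        # runner-up 4, ... and the fifth-best 1; summing these scores across
+        # datasets below favours configurations that are consistently near the top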
+ ranking_list.append( + group_sorted[["embedding_type", "min_cluster_size", "rank_score"]] + ) + + # Combine all datasets + all_scores = pd.concat(ranking_list) + # Sum scores per embedding type + final_ranking = ( + all_scores.groupby(["embedding_type", "min_cluster_size"], as_index=False)[ + "rank_score" + ] + .sum() + .reset_index() + ) + final_ranking = final_ranking.sort_values("rank_score", ascending=False) + final_ranking["final_rank"] = range(1, len(final_ranking) + 1) + + return final_ranking + + +# -------------------------------------------------------------------- +# MAIN +# -------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Compute top N clustering configurations per dataset." + ) + parser.add_argument( + "--metrics", nargs="+", required=True, help="Metrics to consider" + ) + parser.add_argument( + "--directions", + nargs="+", + required=True, + help="Directions for each metric (max/min)", + ) + + parser.add_argument( + "--filter", + action="store_true", + help="If set, only take into account used datasets", + ) + + parser.add_argument( + "--min-cluster-size", + type=int, + help="Only consider rows with this min_cluster_size", + ) + parser.add_argument("--top", type=int, default=5, help="Top N results per dataset") + parser.add_argument( + "--outfile", default="best_clusterings_summary.csv", help="Output CSV path" + ) + args = parser.parse_args() + + if len(args.metrics) != len(args.directions): + parser.error("Number of metrics and directions must match.") + + print(f"📂 Loading metrics from {METRICS_FILE}") + df = pd.read_csv(METRICS_FILE) + + # Apply filter if specified + if args.min_cluster_size is not None: + df = df[df["min_cluster_size"] == args.min_cluster_size] + if df.empty: + print(f"⚠️ No rows found with min_cluster_size = {args.min_cluster_size}") + return + + print(f"✅ Found {len(df)} rows across {df['dataset_name'].nunique()} datasets") + combined = compute_best_per_dataset( + df, args.metrics, args.directions, args.top, filter_datasets=args.filter + ) + + # Select main display columns + cols_to_show = [ + "dataset_name", + "embedding_type", + "min_cluster_size", + "intermediate_dims", + "method", + *args.metrics, + "final_score", + ] + cols_to_show = [c for c in cols_to_show if c in combined.columns] + + print("\n=== Top results per dataset ===") + for ds, g in combined.groupby("dataset_name"): + print(f"\n--- {ds} ---") + print(g[cols_to_show]) + + # Save combined summary + out_path = Path(args.outfile) + combined.to_csv(out_path, index=False) + print(f"\n✅ Summary saved to {out_path.resolve()}") + + final_ranking = compute_embedding_ranking( + combined, top_n=args.top, filter_datasets=args.filter + ) + + print("\n=== Final Ranking of Embedding Types ===") + print(final_ranking) + + +if __name__ == "__main__": + main() diff --git a/docgenie/analyzation/clustering/core/_algorithms.py b/docgenie/analyzation/clustering/core/_algorithms.py new file mode 100755 index 0000000000000000000000000000000000000000..38224262f5e79a5f51142d3f096ffc1e2db32aeb --- /dev/null +++ b/docgenie/analyzation/clustering/core/_algorithms.py @@ -0,0 +1,288 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from docgenie.analyzation.clustering.core._utilities import ( + EmbeddingType, +) +from docgenie.analyzation.clustering.core._embeddings import ( + _load_embeddings, +) +from docgenie.logging import get_logger + +if TYPE_CHECKING: + import numpy as np + import torch + 
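+
+# Clustering helpers: embeddings are L2-normalised, optionally reduced with UMAP,
+# then clustered with HDBSCAN (noise points can be re-assigned via k-NN) or KMeans.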
+logger = get_logger(__name__) + + +def _normalized_embeddings( + embeddings: np.ndarray, +) -> np.ndarray: + import numpy as np + + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + return embeddings / norms + + +def _reduce_embeddings_dims( + embeddings: torch.Tensor, + intermediate_num_dims: int = None, + reduce_dim_metric: str = "euclidean", + seed: int = None, +): + import math + + import umap + + if intermediate_num_dims is None: + intermediate_num_dims = math.floor(math.sqrt(embeddings.shape[1])) + + if intermediate_num_dims < embeddings.shape[1]: + logger.info( + f"Reducing embedding dimensions from {embeddings.shape[1]} to {intermediate_num_dims=} before clustering..." + ) + umap_engine = umap.UMAP( + n_components=intermediate_num_dims, + metric=reduce_dim_metric, + n_jobs=-1, + verbose=False, + random_state=seed, + ) + return umap_engine.fit_transform(embeddings) + return embeddings + + +def _run_hdbscan( + embeddings: torch.Tensor, + hdbscan_min_cluster_size: int = 10, + hdbscan_metric: str = "euclidean", + seed: int = None, +): + import hdbscan + import numpy as np + + approx_min_span_tree = True + if seed is not None: + np.random.seed(seed) + approx_min_span_tree = False # otherwise not deterministic + + logger.info("Running HDBSCAN...") + clusterer = hdbscan.HDBSCAN( + min_cluster_size=hdbscan_min_cluster_size, + metric=hdbscan_metric, + core_dist_n_jobs=-1, + approx_min_span_tree=approx_min_span_tree, + algorithm="best", + prediction_data=True, + ) + cluster_labels = clusterer.fit_predict(embeddings) + soft_clusters = hdbscan.all_points_membership_vectors(clusterer) + return soft_clusters, cluster_labels + + +def _run_knn( + embeddings: torch.Tensor, + cluster_labels: np.ndarray, + k_nn_n_neighbors: int = 5, +): + import copy + + from sklearn.neighbors import KNeighborsClassifier + + # train k-NN classifier + noise_mask = cluster_labels == -1 + non_noise_mask = cluster_labels != -1 + X_non_noise = embeddings[non_noise_mask] + y_non_noise = cluster_labels[non_noise_mask] + knn = KNeighborsClassifier(n_neighbors=k_nn_n_neighbors, n_jobs=-1) + knn.fit(X_non_noise, y_non_noise) + + X_noise = embeddings[noise_mask] + predicted_labels = knn.predict(X_noise) + + # assign predicted labels back to noise points + cluster_labels = copy.deepcopy(cluster_labels) + cluster_labels[noise_mask] = predicted_labels + + return cluster_labels + + +def _get_cached_reduced_embeddings( + embeddings: np.ndarray, + intermediate_num_dims: int, + reduce_dim_metric: str, + seed: int, + cache_dir: str = None, +) -> np.ndarray: + """Get reduced embeddings from cache or compute and cache them.""" + import os + import pickle + + if cache_dir is None: + # Compute without caching + return _reduce_embeddings_dims( + embeddings=embeddings, + intermediate_num_dims=intermediate_num_dims, + reduce_dim_metric=reduce_dim_metric, + seed=seed, + ) + + # Create cache key from parameters + cache_key = f"{intermediate_num_dims}_{reduce_dim_metric}_{seed}" + cache_file = os.path.join(cache_dir, f"reduced_embeddings_{cache_key}.pkl") + + # Try to load from cache + if os.path.exists(cache_file): + logger.info(f"Loading reduced embeddings from cache: {cache_file}") + with open(cache_file, "rb") as f: + return pickle.load(f) + + # Compute and cache + os.makedirs(cache_dir, exist_ok=True) + reduced_embeddings = _reduce_embeddings_dims( + embeddings=embeddings, + intermediate_num_dims=intermediate_num_dims, + reduce_dim_metric=reduce_dim_metric, + seed=seed, + ) + + with open(cache_file, "wb") as f: + 
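+        # persist the reduced embeddings so later runs that use the same
+        # (intermediate_num_dims, reduce_dim_metric, seed) cache key skip the UMAP step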
pickle.dump(reduced_embeddings, f) + logger.info(f"Cached reduced embeddings to: {cache_file}") + + return reduced_embeddings + + +# layoutlm CLS token, clip, text, combined +def _read_and_cluster_embeddings( + embeddings_dir: str, + dataset_name: str, + embedding_type: EmbeddingType, + intermediate_num_dims: int = None, + hdbscan_min_cluster_size: int = 10, + hdbscan_metric: str = "euclidean", + method: str = "hdbscan", + n_kmeans_clusters: int = 150, + k_nn_n_neighbors: int = 5, + seed: int = 42, + do_knn: bool = True, + cache_dir: str = None, +) -> dict: + """ + Read embeddings from H5PY file, reduce dimensions, and cluster them. + + This function first loads the embeddings from an H5PY file, normalizes them to unit length, + and then reduces their dimensions using UMAP if specified (by default we always use umap). + It then applies the chosen clustering algorithm (HDBSCAN or KMeans) to the reduced embeddings. Usually we only + use HDBSCAN currently with KNN, and optionally apply k-NN to label noise points. Without KNN, HDBSCAN returns + clusters with noise points associated a label of -1. KMeans is also supported as an alternative clustering method. + The function returns a dictionary containing the cluster labels, noise mask, number of noise points, + reduced embeddings, and soft cluster assignments. + + Args: + embeddings_dir (str): Directory where the embeddings H5PY file is located. + dataset_name (str): Name of the dataset (used to construct the file name). + embedding_type (EmbeddingType): Type of embeddings (layout, clip, text). + intermediate_num_dims (int, optional): Number of dimensions to reduce embeddings to before clustering. + If None, no dimensionality reduction is applied. Defaults to None. + hdbscan_min_cluster_size (int, optional): Minimum cluster size for HDBSCAN algorithm. + Defaults to 10. + hdbscan_metric (str, optional): Distance metric used by HDBSCAN algorithm. + Defaults to "euclidean". + method (str, optional): The clustering method to use ("hdbscan" or "kmeans"). Defaults to "hdbscan". + n_kmeans_clusters (int, optional): Number of clusters for KMeans algorithm. + Only used if method is "kmeans". Defaults to 150. + k_nn_n_neighbors (int, optional): Number of neighbors for k-NN algorithm. + Only used if method is "hdbscan" and do_knn is True. Defaults to 5. + seed (int, optional): Random seed for reproducibility. Defaults to 42. + do_knn (bool, optional): Whether to apply k-nearest neighbors processing. + Only used if method is "hdbscan". Defaults to True. + cache_dir (str, optional): Directory to cache reduced embeddings. + If None, no caching is done. Defaults to None. 
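+
+    Example (illustrative arguments; path and dataset name are placeholders):
+        results = _read_and_cluster_embeddings(
+            embeddings_dir="data/embeddings",
+            dataset_name="rvlcdip",
+            embedding_type=EmbeddingType.combined,
+            intermediate_num_dims=100,
+            hdbscan_min_cluster_size=10,
+        )
+        labels = results["cluster_labels"]  # one cluster id per document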
+ """ + import numpy as np + import torch + from pathlib import Path + + # read the embeddings + embeddings, _ = _load_embeddings( + file_path=Path(embeddings_dir) / dataset_name / f"{embedding_type.value}.h5" + ) + embeddings = torch.from_numpy(embeddings) + + # normalize the embeddings + embeddings = _normalized_embeddings(embeddings) + + # we also reduce embeddings to embeddings_2d for visualization + # we only run it to cache the embeddings + _get_cached_reduced_embeddings( + embeddings=embeddings, + intermediate_num_dims=2, + reduce_dim_metric=hdbscan_metric, + seed=seed, + cache_dir=cache_dir, + ) + + # reduce embedding dimensions for clustering + embeddings_reduced_dim = _get_cached_reduced_embeddings( + embeddings=embeddings, + intermediate_num_dims=intermediate_num_dims, + reduce_dim_metric=hdbscan_metric, + seed=seed, + cache_dir=cache_dir, + ) + + # convert embeddings to double + embeddings_reduced_dim = embeddings_reduced_dim.astype(np.double) + + # normalize reduced embeddings + embeddings_reduced_dim = _normalized_embeddings(embeddings_reduced_dim) + + if method == "hdbscan": + # step 1: run the clustering algorithm on the embeddings + soft_clusters, cluster_labels = _run_hdbscan( + embeddings=embeddings_reduced_dim, + hdbscan_min_cluster_size=hdbscan_min_cluster_size, + hdbscan_metric=hdbscan_metric, + seed=seed, + ) + + # step 2: train k-NN on non-noise points + # select points that are not labeled as noise + num_noise = np.sum(cluster_labels == -1) + noise_mask = cluster_labels == -1 + + logger.info("Number of noise points: %d", num_noise) + + # return if not using k-NN to label noise points + if do_knn and num_noise > 0: + cluster_labels = _run_knn( + embeddings=embeddings_reduced_dim, + cluster_labels=cluster_labels, + k_nn_n_neighbors=k_nn_n_neighbors, + ) + + return { + "cluster_labels": cluster_labels, + "noise_mask": noise_mask, + "num_noise": num_noise, + "embeddings_reduced_dim": embeddings_reduced_dim, + "soft_clusters": soft_clusters, + } + elif method == "kmeans": + from sklearn.cluster import KMeans + + kmeans = KMeans(n_clusters=n_kmeans_clusters, random_state=seed, n_init="auto") + cluster_labels = kmeans.fit_predict(embeddings_reduced_dim) + soft_clusters = np.zeros((len(cluster_labels), n_kmeans_clusters)) + soft_clusters[np.arange(len(cluster_labels)), cluster_labels] = 1.0 + return { + "cluster_labels": cluster_labels, + "num_noise": 0, + "embeddings_reduced_dim": embeddings_reduced_dim, + "soft_clusters": soft_clusters, + } + else: + raise ValueError(f"Unknown clustering method: {method}") diff --git a/docgenie/analyzation/clustering/core/_embeddings.py b/docgenie/analyzation/clustering/core/_embeddings.py new file mode 100755 index 0000000000000000000000000000000000000000..6514a464fb332b1d1abf39855f706da451032a13 --- /dev/null +++ b/docgenie/analyzation/clustering/core/_embeddings.py @@ -0,0 +1,329 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Callable + +import tqdm + +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data._core._data_types import DocumentInstanceModelInput +from docgenie.logging import get_logger + +if TYPE_CHECKING: + import numpy as np + from torch.utils.data import DataLoader + +logger = get_logger(__name__) + + +def _iterate_dataset( + model_fn: Callable, + embedding_fn: Callable, + dataloader: "DataLoader", + device: str = "cpu", +): + """Inner function that actually generates the embeddings.""" + import torch + + model = model_fn() + 
model.to(device) + model.eval() + + sample_ids = [] + embeddings = [] + with torch.no_grad(): + for batch in tqdm.tqdm(dataloader, desc="Extracting embeddings"): + batch: DocumentInstanceModelInput + batch = batch.select_first_overflow_samples() + batch = batch.to(device) + + token_bboxes = batch.token_bboxes + if token_bboxes is not None: + if token_bboxes.min() >= 0 and token_bboxes.max() <= 1.0: + # if bboxes are normalized to [0, 1], convert to [0, 1000] as expected by layoutlmv3 + token_bboxes = (token_bboxes * 1000).long() + else: + logger.warning( + f"Token bboxes must be in the range [0, 1], but got min {token_bboxes.min()} and max {token_bboxes.max()}" + ) + token_bboxes = (token_bboxes.clip(0, 1.0) * 1000).long() + + # assert check + assert token_bboxes.min() >= 0 and token_bboxes.max() <= 1000, ( + f"Token bboxes must be in the range [0, 1000], but got min {token_bboxes.min()} and max {token_bboxes.max()}" + ) + + # make sure if image is normlized 0-1 as in layoutlm we renormalize using clip stats + assert batch.image.min() >= -1.1 and batch.image.max() <= 1.1, ( + f"Image pixel values must be in the range [0, 1], but got min {batch.image.min()} and max {batch.image.max()}" + ) + + # make inputs + inputs = dict( + input_ids=batch.token_ids, + bbox=token_bboxes, + attention_mask=batch.attention_mask, + pixel_values=batch.image, + words=batch.words, + ) + + embeddings.append(embedding_fn(model, inputs)) + + # in our preprocessed dataset indices are always unqiue + # but sample_ids may not be always unique in some rare cases + sample_ids.extend(batch.sample_id) + + embeddings = torch.cat(embeddings, dim=0) + return embeddings.cpu().numpy(), sample_ids + + +def _extract_layoutlm_embeddings( + dataloader: "DataLoader", + device: str = "cpu", +): + """Inner function that actually generates the embeddings.""" + + def model_fn(): + from transformers import ( + LayoutLMv3Model, + ) + + model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base") + model.to(device) + model.eval() + return model + + def embedding_fn(model, inputs): + outputs = model( + input_ids=inputs["input_ids"], + bbox=inputs["bbox"], + attention_mask=inputs["attention_mask"], + pixel_values=inputs["pixel_values"], + ) + return outputs.last_hidden_state[:, 0, :] + + embeddings, sample_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=embedding_fn, + dataloader=dataloader, + device=device, + ) + + return embeddings, sample_ids + + +def _extract_text_embeddings( + dataloader: "DataLoader", + device: str = "cpu", +): + """Inner function that actually generates the embeddings.""" + + def model_fn(): + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-mpnet-base-v2") + model.to(device) + model.eval() + return model + + def embedding_fn(model, inputs): + sentences = [" ".join(words_per_sample) for words_per_sample in inputs["words"]] + return model.encode(sentences, convert_to_tensor=True) + + embeddings, sample_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=embedding_fn, + dataloader=dataloader, + device=device, + ) + + return embeddings, sample_ids + + +def _extract_image_embeddings( + dataloader: "DataLoader", + device: str = "cpu", +): + """Inner function that actually generates the embeddings.""" + OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] + OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + + def model_fn(): + from transformers import ( + CLIPModel, + ) + + model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + 
model.to(device) + model.eval() + return model + + def embedding_fn(model, inputs): + from torchvision.transforms.functional import normalize + + # make sure if image is normlized 0-1 as in layoutlm we renormalize using clip stats + inputs["pixel_values"] = inputs["pixel_values"] * 0.5 + 0.5 # -1 to 1 to [0, 1] + inputs["pixel_values"] = normalize( + inputs["pixel_values"], mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD + ) + outputs = model.get_image_features(pixel_values=inputs["pixel_values"]) + return outputs.cpu() + + embeddings, sample_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=embedding_fn, + dataloader=dataloader, + device=device, + ) + + return embeddings, sample_ids + + +def _extract_paper_embeddings( + dataloader: "DataLoader", + device: str = "cpu", + paper_embedding_kernel_size: int = 4, +): + """Inner function that actually generates the embeddings.""" + + def model_fn(): + from transformers import ( + LayoutLMv3Model, + ) + + model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base") + model.to(device) + model.eval() + return model + + def embedding_fn(model, inputs): + import torch + from torch import nn + + # do layoutlmv3 forward + outputs = model( + input_ids=inputs["input_ids"], + bbox=inputs["bbox"], + attention_mask=inputs["attention_mask"], + pixel_values=inputs["pixel_values"], + ) + + # get last last_hidden_state + last_hidden_state_batch = outputs.last_hidden_state + + # now apply paper embedding logic + pad_token_id = model.config.pad_token_id + num_image_tokens = (model.config.input_size // model.config.patch_size) ** 2 + embeddings = [] + for idx in range(last_hidden_state_batch.shape[0]): + last_hidden_state = last_hidden_state_batch[idx, :, :] # (L, D) + Lt = (inputs["input_ids"][idx] != pad_token_id).sum() # its a 1D tensor + text_embedding = last_hidden_state[:Lt, :] # (Lt, D) + # image_embedding_with_padding = last_hidden_state[Lt:, :] # (Lv, D) + image_embedding = last_hidden_state[-num_image_tokens:, :] # (Lv, D) + + # Step 1: Mean pooling of text embeddings + vt = text_embedding.mean(dim=0) # shape: (D,) + + # Step 2: 1D max-pooling on image embeddings to reduce feature dimension + # Reshape Hv to (Lv, 1, D) to apply 1D max-pooling along the feature dimension + Hv_reshaped = image_embedding.unsqueeze(1) # (Lv, 1, D) + maxpool = nn.MaxPool1d( + kernel_size=paper_embedding_kernel_size, + stride=paper_embedding_kernel_size, + ) + Hv_pooled = maxpool(Hv_reshaped) # (Lv, 1, N), N < D + Hv_pooled = Hv_pooled.squeeze(1) # shape: (Lv, N) + + # Step 3: Mean pooling of pooled image embeddings + vv = Hv_pooled.mean(dim=0) # shape: (N,) + + # Step 4: Concatenate text and pooled image embeddings + v = torch.cat([vt, vv], dim=0) # shape: (D + N,) + embeddings.append(v) + return torch.stack(embeddings, dim=0) # (B, D + N) + + embeddings, sample_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=embedding_fn, + dataloader=dataloader, + device=device, + ) + + return embeddings, sample_ids + + +def embedding_extraction_with_cache( + dataloader: "DataLoader", + output_dir: str | Path, + embedding_type: EmbeddingType, + device: str = "cpu", + cache_outputs: bool = True, +): + """Generic cacher function that handles caching logic for any embedding type.""" + cache_file = Path(output_dir) / f"{embedding_type.value}.h5" + if cache_outputs and cache_file.exists(): + logger.info( + f"Loading cached {embedding_type.value} embeddings from {cache_file}" + ) + return _load_embeddings(cache_file) + + # Generate new embeddings using the provided 
extraction function + if embedding_type == EmbeddingType.layout: + extraction_func = _extract_layoutlm_embeddings + embeddings, sample_ids = extraction_func(dataloader, device) + elif embedding_type == EmbeddingType.text: + extraction_func = _extract_text_embeddings + embeddings, sample_ids = extraction_func(dataloader, device) + elif embedding_type == EmbeddingType.image: + extraction_func = _extract_image_embeddings + embeddings, sample_ids = extraction_func(dataloader, device) + elif embedding_type == EmbeddingType.paper: + extraction_func = _extract_paper_embeddings + embeddings, sample_ids = extraction_func(dataloader, device) + else: + raise ValueError(f"Unsupported embedding type: {embedding_type}") + + if cache_outputs: + assert len(sample_ids) == embeddings.shape[0], ( + f"Number of sample IDs ({len(sample_ids)}) must match number of embeddings ({embeddings.shape[0]})" + ) + assert len(set(sample_ids)) == len(sample_ids), "Sample IDs must be unique" + _save_embeddings( + embeddings=embeddings, + sample_ids=sample_ids, + file_path=Path(output_dir) / f"{embedding_type.value}.h5", + ) + return _load_embeddings(cache_file) + + return embeddings, sample_ids + + +def _save_embeddings(embeddings: "np.ndarray", sample_ids: list[str], file_path: Path): + import h5py + + file_path.parent.mkdir(parents=True, exist_ok=True) + with h5py.File(file_path, "w") as f: + f.create_dataset("embeddings", data=embeddings) + f.create_dataset("sample_ids", data=sample_ids) + + +def _load_embeddings(file_path: Path): + import h5py + + with h5py.File(file_path, "r") as f: + sample_ids = f["sample_ids"][:] + embeddings = f["embeddings"][:] + return embeddings, [ + s.decode("utf-8") if isinstance(s, bytes) else s for s in sample_ids + ] + + +def _load_sample_ids_from_embeddings(file_path: Path): + import h5py + + with h5py.File(file_path, "r") as f: + sample_ids = f["sample_ids"][:] + return [ # decode and remove the index suffx + s.decode("utf-8") if isinstance(s, bytes) else s for s in sample_ids + ] diff --git a/docgenie/analyzation/clustering/core/_metrics.py b/docgenie/analyzation/clustering/core/_metrics.py new file mode 100755 index 0000000000000000000000000000000000000000..38fa58d85eee9d945cf4095b49bfd99c1493bfbe --- /dev/null +++ b/docgenie/analyzation/clustering/core/_metrics.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import numpy as np + import pandas as pd + + +# Distance / Connectivity +def _normalized_connectivity(X, labels, n_neighbors=10): + """ + Normalized connectivity metric: measures if each point's nearest neighbors + are in the same cluster. 0 = perfect connectivity, 1 = worst. 
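+    Concretely: for each point, take the fraction of its n_neighbors nearest
+    neighbours that fall in a different cluster, then average this fraction over
+    all points.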
+ + Parameters: + - X: data points (n_samples x n_features) + - labels: cluster labels + - n_neighbors: number of neighbors to consider + + Returns: + - normalized connectivity score (0-1) + """ + from sklearn.neighbors import NearestNeighbors + + n_samples = X.shape[0] + nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(X) + distances, indices = nbrs.kneighbors(X) + + # Exclude self from neighbors + indices = indices[:, 1:] + score = 0 + for i in range(n_samples): + for j in indices[i]: + if labels[i] != labels[j]: + score += 1 / n_neighbors # penalize different cluster + + # Maximum possible score is n_samples (each point has all neighbors in other clusters) + max_score = n_samples + normalized_score = score / max_score + return normalized_score + + +# Compactness / Separation +def _cluster_compactness_scores(embeddings, labels): + """ + Compute compactness scores for clusters using various metrics. + """ + from sklearn.metrics import ( + calinski_harabasz_score, + davies_bouldin_score, + silhouette_score, + ) + + return { + "silhouette_score": silhouette_score(embeddings, labels), + "calinski_harabasz_score": calinski_harabasz_score(embeddings, labels), + "davies_bouldin_score": davies_bouldin_score(embeddings, labels), + } + + +# Balance / Size Equity +def _cluster_balance_scores(cluster_sizes): + """ + Compute balance scores for clusters using various metrics. + """ + import numpy as np + import scipy + + sizes = np.array(cluster_sizes) + entropy = scipy.stats.entropy(sizes) + norm_entropy = entropy / np.log(len(sizes)) + + # Coefficient of variation + cv = sizes.std() / sizes.mean() + mmr = sizes.min() / sizes.max() + + # Gini coefficient + sorted_sizes = np.sort(sizes) + n = len(sizes) + gini = ( + 2 * np.sum((np.arange(1, n + 1)) * sorted_sizes) / (n * sorted_sizes.sum()) + ) - (n + 1) / n + + return { + "entropy": norm_entropy.item(), + "coefficient_of_variation": cv.item(), + "min-to-max-ratio": mmr.item(), + "gini-coefficient": gini.item(), + } + + +def evaluate_clusters_unsupervised( + embeddings: np.ndarray, cluster_labels: np.ndarray +) -> tuple[dict[str, float], int]: + """ + Evaluate clustering quality using unsupervised metrics. + """ + import numpy as np + import torch + + if isinstance(embeddings, torch.Tensor): + embeddings = embeddings.numpy() + + unique_entries, counts = np.unique(cluster_labels, return_counts=True) + result = dict() + result["connectivity"] = { + "normalized_connectivity": _normalized_connectivity( + X=embeddings, + labels=cluster_labels, + n_neighbors=int(embeddings.shape[0] * 0.01), + ), + } + result["compactness"] = _cluster_compactness_scores( + embeddings=embeddings, labels=cluster_labels + ) + result["balance"] = _cluster_balance_scores(counts) + return result, len(unique_entries) + + +def calculate_cluster_statistics( + embeddings: np.ndarray, cluster_labels: np.ndarray +) -> "pd.DataFrame": + """ + Calculate statistics for each cluster, including size and variance. + Variance is computed as the average pairwise cosine distance within the cluster. 
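+
+    Returns:
+        A DataFrame with one row per cluster and columns cluster_id, size and variance.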
+ """ + import numpy as np + import pandas as pd + from sklearn.metrics.pairwise import cosine_similarity + + unique_clusters = set(cluster_labels) + cluster_stats = [] + for cluster_id in unique_clusters: + cluster_mask = cluster_labels == cluster_id + cluster_embeddings = embeddings[cluster_mask] + cluster_size = len(cluster_embeddings) + sim_matrix = cosine_similarity(cluster_embeddings) + cosine_distances = 1 - sim_matrix[np.triu_indices_from(sim_matrix, k=1)] + cosine_diversity = np.mean(cosine_distances) + cluster_stats.append( + { + "cluster_id": cluster_id, + "size": cluster_size, + "variance": cosine_diversity, + } + ) + return pd.DataFrame(cluster_stats) diff --git a/docgenie/analyzation/clustering/core/_utilities.py b/docgenie/analyzation/clustering/core/_utilities.py new file mode 100755 index 0000000000000000000000000000000000000000..d04d9dbd0b3e9ca154d10f170b3c5eeeaa7556a5 --- /dev/null +++ b/docgenie/analyzation/clustering/core/_utilities.py @@ -0,0 +1,268 @@ +from __future__ import annotations + +import enum +from pathlib import Path +from typing import TYPE_CHECKING + +import pandas as pd + +from docgenie.logging import get_logger + +if TYPE_CHECKING: + import numpy as np + from PIL.Image import Image + +logger = get_logger(__name__) + + +if TYPE_CHECKING: + import torch + + +class EmbeddingType(str, enum.Enum): + """ + Enum for different types of embeddings used in DocGenie. + """ + + layout = "layout" + image = "image" + text = "text" + combined = "combined" + paper = "paper_kernel=4" + + +def _glob_clustering_output_paths(output_dir: str | Path): + """ + List all clustering output files in the specified directory. + + Args: + output_dir (str | Path): The directory to search for clustering output files. + This must point to the `output_directory/dataset_name/embedding_type` level. + """ + output_path = Path(output_dir) + return list( + output_path.glob("method=*_clusters_ind=*_hmcs=*_hm=*_do_knn=*_knn=*.npy") + ) + + +def _get_clustering_output_path( + output_dir: str | Path, + intermediate_num_dims: int, + hdbscan_min_cluster_size: int = 10, + hdbscan_metric: str = "euclidean", + do_knn: bool = True, + k_nn_n_neighbors: int = 5, + method: str = "hdbscan", +): + """ + Generate a standardized file path for clustering output results. + + This function creates a descriptive filename that encodes all the clustering + parameters used, allowing for easy identification and retrieval of clustering + results based on the specific configuration. + + Args: + output_dir (str | Path): The base directory where clustering results will be saved. + This must point to the `output_directory/dataset_name/embedding_type` level. + intermediate_num_dims (int): The number of dimensions used in intermediate processing. + hdbscan_min_cluster_size (int, optional): Minimum cluster size for HDBSCAN algorithm. + Defaults to 10. + hdbscan_metric (str, optional): Distance metric used by HDBSCAN algorithm. + Defaults to "euclidean". + do_knn (bool, optional): Whether to apply k-nearest neighbors processing. + Defaults to True. + k_nn_n_neighbors (int, optional): Number of neighbors for k-NN algorithm. + Defaults to 5. + method (str, optional): The clustering method being used. Defaults to "hdbscan". 
+ + Returns: + Path: A Path object pointing to the clustering output file with encoded parameters + in the filename format: method={method}_clusters_ind={intermediate_num_dims}_ + hmcs={hdbscan_min_cluster_size}_hm={hdbscan_metric}_do_knn={do_knn}_ + knn={k_nn_n_neighbors}.npy + """ + return ( + output_dir + / f"method={method}_clusters_ind={intermediate_num_dims}_hmcs={hdbscan_min_cluster_size}_hm={hdbscan_metric}_do_knn={do_knn}_knn={k_nn_n_neighbors}.npy" + ) + + +def _save_clustering_metrics( + output_dir: str | Path, + dataset_name: str, + hdbscan_min_cluster_size: int, + intermediate_num_dims: int, + hdbscan_metric: str, + k_nn_n_neighbors: int, + method: str, + embedding_type: "EmbeddingType", + embeddings: "np.ndarray", + cluster_metrics: dict, + num_clusters: int, + num_noise: int, + seed: int, + do_knn: bool = True, +) -> None: + import hashlib + + import torch + + cnt = embeddings.shape[0] + noise_percent = num_noise / float(cnt) + noise_percent = ( + noise_percent.item() + if isinstance(noise_percent, torch.Tensor) + else noise_percent + ) + metrics_row = { + "dataset_name": dataset_name, + "embedding_type": embedding_type.value, + "min_cluster_size": hdbscan_min_cluster_size, + "intermediate_dims": intermediate_num_dims, + "hdbscan_metric": hdbscan_metric, + "k_nn_n_neighbors": k_nn_n_neighbors, + "num_clusters": num_clusters, + "num_noise": num_noise, + "noise_percent": noise_percent, + "do_knn": do_knn, + "method": method, + } + + # Add cluster metrics to the row + for cat, items in cluster_metrics.items(): + for k, v in items.items(): + metrics_row[f"{cat}__{k}"] = v + + # Generate unique hash based on configuration parameters only (excluding results) + config_items = { + "dataset_name": dataset_name, + "embedding_type": embedding_type.value, + "min_cluster_size": hdbscan_min_cluster_size, + "intermediate_dims": intermediate_num_dims, + "hdbscan_metric": hdbscan_metric, + "k_nn_n_neighbors": k_nn_n_neighbors, + "seed": seed, + "do_knn": do_knn, + "method": method, + } + row_hash = hashlib.md5(str(sorted(config_items.items())).encode()).hexdigest() + metrics_row["row_hash"] = row_hash + + # Save metrics + metrics_path = Path(output_dir) / f"metrics-seed={seed}.csv" + if metrics_path.exists(): + df = pd.read_csv(metrics_path) + df = df[df["row_hash"] != row_hash] + df = pd.concat([df, pd.DataFrame([metrics_row])], ignore_index=True) + else: + df = pd.DataFrame([metrics_row]) + + logger.info(f"Saving clustering metrics to {metrics_path}...") + df.to_csv(metrics_path, index=False) + + +def _visualize_images_grid( + images: list[np.ndarray | "Image"], + save_path: str | Path, + nrow: int = 8, + title: str | None = None, + figsize: tuple[int, int] = (12, 8), + dpi: int = 150, +) -> None: + """ + Create and save an image grid using torchvision's make_grid utility. 
+ + Args: + images: List of numpy arrays or PIL images to arrange in grid + save_path: Path where the grid image will be saved + nrow: Number of images displayed in each row of the grid + title: Optional title for the saved image + figsize: Figure size for matplotlib + dpi: DPI for saved image + """ + import matplotlib.pyplot as plt + import numpy as np + import torch + import torchvision.transforms as transforms + from torchvision.transforms.functional import resize + from torchvision.utils import make_grid + + # Convert inputs to tensors + tensor_images = [] + for img in images: + if isinstance(img, np.ndarray): + # Handle different numpy array formats + if img.ndim == 2: # Grayscale + img = np.expand_dims(img, axis=0) # Add channel dimension + elif img.ndim == 3 and img.shape[2] == 3: # RGB with channels last + img = np.transpose(img, (2, 0, 1)) # Convert to channels first + elif img.ndim == 3 and img.shape[0] in [1, 3]: # Already channels first + pass + else: + raise ValueError(f"Unsupported numpy array shape: {img.shape}") + + tensor = torch.from_numpy(img).float() + else: # PIL Image + transform = transforms.ToTensor() + tensor = transform(img) + + tensor = resize(tensor, size=(512, 512)) # Resize to fixed size + tensor_images.append(tensor) + + # Stack all tensors + batch_tensor = torch.stack(tensor_images) + + # Create grid + grid = make_grid( + batch_tensor, + nrow=nrow, + ) + + # Convert to numpy for matplotlib (channels last) + grid_np = grid.permute(1, 2, 0).numpy() + + # Create matplotlib figure + fig, ax = plt.subplots(figsize=figsize) + ax.imshow(grid_np) + ax.axis("off") + + if title: + ax.set_title(title, fontsize=16, pad=20) + + # Save the figure + plt.tight_layout() + plt.savefig(save_path, dpi=dpi, bbox_inches="tight", pad_inches=0.1) + plt.close() + + logger.info(f"Image grid saved to {save_path}") + + +def _load_pdfs_to_pil_images(pdf_paths: list[str | Path]) -> list["Image"]: + """ + Loads a list of PDF document paths to PIL Images by rendering the first page of each PDF as PNG. 
+ + Args: + pdf_paths: List of paths to PDF files + + Returns: + List of PIL Image objects, one for each PDF's first page + """ + from pdf2image import convert_from_path + + pil_images = [] + + for pdf_path in pdf_paths: + try: + # Convert first page of PDF to PIL Image + images = convert_from_path(str(pdf_path), first_page=1, last_page=1, dpi=72) + + if images: + pil_images.append(images[0]) + else: + logger.warning(f"No images converted from PDF: {pdf_path}") + + except Exception as e: + logger.error(f"Failed to convert PDF {pdf_path}: {e}") + continue + + return pil_images diff --git a/docgenie/analyzation/clustering/utils.py b/docgenie/analyzation/clustering/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..d127a9763d81c60e8f49e8268d7cd4142b61954d --- /dev/null +++ b/docgenie/analyzation/clustering/utils.py @@ -0,0 +1,24 @@ + + +import h5py +import numpy as np +from tqdm import tqdm + +from docgenie import ENV + + +def read_embeddings_numpy(dataset_name: str, embeddings_type: str, kernel_size: int = None) -> np.ndarray: + all_embeddings = [] + fname = f'{dataset_name}_{embeddings_type}' + if embeddings_type == 'paper': + fname += f'_kernel={kernel_size}' + + fpath = ENV.EMBEDDINGS_DIR / f'{fname}.h5' + with h5py.File(fpath, "r") as f: + for id_ in tqdm(sorted(f.keys())): + emb = f[id_][:] # load tensor in numpy format + all_embeddings.append(emb) + + # Vertically stack along the first dimension + X = np.vstack(all_embeddings) + return X \ No newline at end of file diff --git a/docgenie/analyzation/clustering/webapp/__init__.py b/docgenie/analyzation/clustering/webapp/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..ebe0d18b611b4c8fdc6378618de71d7bb7fabd12 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/__init__.py @@ -0,0 +1,11 @@ +""" +Document clustering visualization web application. + +This package provides an interactive Dash web application for visualizing +document clustering results with scatter plots, cluster analysis, and +document preview capabilities. +""" + +from .app import create_app, main + +__all__ = ["create_app", "main"] diff --git a/docgenie/analyzation/clustering/webapp/_deprecated/visualize_clusters.py b/docgenie/analyzation/clustering/webapp/_deprecated/visualize_clusters.py new file mode 100755 index 0000000000000000000000000000000000000000..55876078d5b3624db4488e8366f9869467f6de30 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/_deprecated/visualize_clusters.py @@ -0,0 +1,634 @@ +""" +Document Clustering Visualization Dashboard + +A refactored modular version of the clustering visualization tool. +This file serves as the main entry point and maintains backward compatibility. 
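+
+Note: the code below the entry point appears to be the earlier monolithic
+implementation, retained in this _deprecated package for reference.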
+""" + +from .app import main + +if __name__ == "__main__": + main() +from flask import Response + +# from flask import send_from_directory +from plotly.subplots import make_subplots + +from docgenie import ENV +from docgenie.analyzation.clustering.core._utilities import ( + EmbeddingType, + _get_clustering_output_path, +) +from docgenie.data import load_dataset + +# -------------------------- +# Dash app + server route to serve PDFs +# -------------------------- +app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP]) +server = app.server + + +@server.route("/image/") +def serve_image(index): + global dataset + image = dataset.train[int(index)].image.content + img_io = io.BytesIO() + image.save(img_io, "PNG") + img_io.seek(0) + return Response(img_io.getvalue(), mimetype="image/png") + + +@server.route("/cluster_grid/") +def serve_cluster_grid(indices_list): + """Create and serve a grid image from multiple document PDFs.""" + import io + + from flask import Response + from PIL import Image, ImageDraw, ImageFont + + try: + print(f"Creating grid for doc IDs: {indices_list}") + # Parse document IDs from comma-separated string + indices_list = indices_list.split(",")[:12] # Limit to 12 for performance + + # Grid dimensions + cols = min(4, len(indices_list)) + rows = (len(indices_list) + cols - 1) // cols + + # Image dimensions + thumb_width, thumb_height = 200, 280 + grid_width = cols * thumb_width + (cols - 1) * 10 # 10px spacing + grid_height = rows * thumb_height + (rows - 1) * 10 + + # Create grid image + grid_img = Image.new("RGB", (grid_width, grid_height), "white") + for i, index in enumerate(indices_list): + row = i // cols + col = i % cols + x = col * (thumb_width + 10) + y = row * (thumb_height + 10) + + image = dataset.train[int(index)].image.content + + try: + # Resize to thumbnail + image.thumbnail( + (thumb_width, thumb_height - 30), Image.Resampling.LANCZOS + ) + + # Paste thumbnail into grid + grid_img.paste(image, (x, y)) + + # Add document ID label + draw = ImageDraw.Draw(grid_img) + try: + font = ImageFont.truetype( + "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12 + ) + except Exception: + font = ImageFont.load_default() + + text_y = y + image.height + 5 + draw.text((x, text_y), index, fill="black", font=font) + except Exception: + # Draw placeholder for failed PDF + draw = ImageDraw.Draw(grid_img) + draw.rectangle( + [x, y, x + thumb_width, y + thumb_height - 30], + outline="gray", + fill="lightgray", + ) + draw.text((x + 10, y + 10), f"Error loading\n{index}", fill="black") + + # Convert PIL Image to bytes for response + img_io = io.BytesIO() + grid_img.save(img_io, "PNG") + img_io.seek(0) + + return Response(img_io.getvalue(), mimetype="image/png") + + except Exception as e: + return f"Error creating grid: {str(e)}", 500 + + +# -------------------------- +# Cluster Analysis Functions +# -------------------------- +def create_cluster_visualization( + cluster_df: pd.DataFrame, + dataset_name: str, + cluster_labels: np.ndarray, +) -> go.Figure: + """Create a comprehensive visualization of cluster statistics with clickable clusters.""" + fig = make_subplots( + rows=4, + cols=1, + subplot_titles=( + "Cluster Sizes", + "Cluster Variances", + "Size vs Variance", + "Distribution", + ), + specs=[ + [{"type": "bar"}], + [{"type": "bar"}], + [{"type": "scatter"}], + [{"type": "histogram"}], + ], + ) + + # Prepare custom data for click events + cluster_indices = {} + for cluster_id in cluster_df["cluster_id"]: + indices = np.where(cluster_labels == 
cluster_id)[0].tolist() + cluster_indices[cluster_id] = indices + + # Plot 1: Cluster sizes (clickable) + fig.add_trace( + go.Bar( + x=cluster_df["cluster_id"], + y=cluster_df["size"], + name="Size", + customdata=[cluster_indices[cid] for cid in cluster_df["cluster_id"]], + hovertemplate="Cluster %{x}
<br>Size: %{y}<br>
Click to view images", + ), + row=1, + col=1, + ) + + # Plot 2: Cluster variances + fig.add_trace( + go.Bar( + x=cluster_df["cluster_id"], + y=cluster_df["variance"], + customdata=[cluster_indices[cid] for cid in cluster_df["cluster_id"]], + name="Variance", + ), + row=2, + col=1, + ) + + # Plot 3: Size vs Variance scatter (clickable) + fig.add_trace( + go.Scatter( + x=cluster_df["size"], + y=cluster_df["variance"], + mode="markers", + text=cluster_df["cluster_id"], + name="Clusters", + customdata=[cluster_indices[cid] for cid in cluster_df["cluster_id"]], + hovertemplate="Cluster %{text}
<br>Size: %{x}<br>Variance: %{y}<br>
Click to view images", + ), + row=3, + col=1, + ) + + # Plot 4: Size distribution + fig.add_trace( + go.Histogram(x=cluster_df["size"], name="Size Distribution"), row=4, col=1 + ) + + # Add JavaScript for click handling + fig.update_layout( + title_text=f"Cluster Analysis for {dataset_name}", showlegend=False, height=1200 + ) + + fig.update_layout( + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=40, r=40, t=40, b=40), + ) + _update_subplot_axes(fig) + + return fig + + +def _update_subplot_axes(fig: go.Figure) -> None: + """Update axes labels for all subplots.""" + fig.update_xaxes(title_text="Cluster ID", row=1, col=1) + fig.update_yaxes(title_text="Size", row=1, col=1) + fig.update_xaxes(title_text="Cluster ID", row=2, col=1) + fig.update_yaxes(title_text="Variance", row=2, col=1) + fig.update_xaxes(title_text="Size", row=3, col=1) + fig.update_yaxes(title_text="Variance", row=3, col=1) + fig.update_xaxes(title_text="Size", row=4, col=1) + fig.update_yaxes(title_text="Count", row=4, col=1) + + +# -------------------------- +# Globals +# -------------------------- +embedding_sources = [ + "paper_kernel=4", + "layout", + "image", + "text", + "combined", +] # example embedding models +intermediate_options = [100] +min_cluster_size_options = [5, 10] +dataset_options = [ + { + "label": name, + "value": name, + } + for name in os.listdir(ENV.CLUSTERS_DIR) +] +dataset_name = "tobacco3482" +dataset = load_dataset( + dataset_name=dataset_name, + split="train", +) + +seed = 42 +metric = "euclidean" +k_nn_n_neighbors = 5 +do_knn = False +labels = None +df = None + + +# -------------------------- +# Callbacks +# -------------------------- +@app.callback( + [ + Input("dataset-dropdown", "value"), + ], +) +def update_dataset(new_dataset_name): + global dataset, dataset_name + dataset_name = new_dataset_name + dataset = load_dataset( + dataset_name=dataset_name, + split="train", + ) + + +# -------------------------- +# Callbacks +# -------------------------- +@app.callback( + [ + Output("scatter", "figure"), + Output("cluster-analysis", "figure"), + ], + [ + Input("dataset-dropdown", "value"), + Input("intermediate-dropdown", "value"), + Input("min-cluster-size-dropdown", "value"), + Input("embedding-dropdown", "value"), + Input("method-dropdown", "value"), + # Input("cluster-size-bar", "clickData"), + ], +) +def update_scatter( + dataset_name, + intermediate_dims, + min_cluster_size, + embedding_src, + method, + # bar_click, +): + global labels, df + + output_dir = ENV.CLUSTERS_DIR / dataset_name / embedding_src + clusters_path = _get_clustering_output_path( + output_dir=output_dir, + intermediate_num_dims=intermediate_dims, + hdbscan_min_cluster_size=1 if method == "kmeans" else min_cluster_size, + hdbscan_metric=metric, + k_nn_n_neighbors=k_nn_n_neighbors, + method=method, + ) + # Create cache key from parameters + cluster_data = np.load(clusters_path, allow_pickle=True).item() + labels = cluster_data["cluster_labels"] + soft_clusters = cluster_data["soft_clusters"] + noise_mask = cluster_data.get("noise_mask", np.array([False] * len(labels))) + + cluster_stats = pd.read_csv( + clusters_path.parent / clusters_path.name.replace(".npy", "_stats.csv") + ) + + emb_2d_path = output_dir / f"reduced_embeddings_2_{metric}_{seed}.pkl" + # Try to load from cache + if not os.path.exists(emb_2d_path): + raise ValueError(f"2D embeddings not found: {emb_2d_path}") + with open(emb_2d_path, "rb") as f: + emb_2d = pickle.load(f) + + x, y = emb_2d[:, 0], emb_2d[:, 1] + df = pd.DataFrame( + { + 
"doc_id": np.arange(len(labels)), + "x": x, + "y": y, + "label": labels, + "prob": np.max(soft_clusters, axis=1), + "index": np.arange(len(labels)), + "noise_mask": noise_mask, + } + ) + + # Optional: filter by cluster if user clicked a bar + # if bar_click and "points" in bar_click: + # cluster_id = int(bar_click["points"][0]["x"]) + # df = df[df["label"] == cluster_id] + # # df_noise = df_noise[df_noise["label"] == cluster_id] + + # Create main (non-noise) scatter + fig = px.scatter( + df, + x="x", + y="y", + color="label", + hover_data={"index": True, "label": True, "doc_id": True}, + title=f"Embeddings ({embedding_src}) — Click a point to view its PDF", + ) + + fig.update_traces(marker=dict(size=7), customdata=df["index"]) + fig.update_layout( + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=20, r=20, t=40, b=20), + legend_title="Cluster", + ) + + # Cluster size bar chart + counts = pd.Series(labels).value_counts().sort_index() + df_counts = counts.reset_index() + df_counts.columns = ["Cluster", "Count"] + + # Create cluster analysis visualization + cluster_analysis_fig = create_cluster_visualization( + cluster_stats, dataset_name, labels + ) + + return fig, cluster_analysis_fig + + +@app.callback( + [ + Output("pdf-viewer", "src"), + Output("pdf-viewer", "hidden"), + Output("doc-info", "children"), + ], + [ + Input("scatter", "clickData"), + Input("cluster-analysis", "clickData"), + ], + prevent_initial_call=False, +) +def display_pdfs(scatter_click, cluster_click): + from dash import callback_context + + # Check which input triggered the callback + ctx = callback_context + if not ctx.triggered: + return "", True, "Click a point or cluster to view documents" + + trigger_id = ctx.triggered[0]["prop_id"].split(".")[0] + + # Handle cluster click - show multiple documents in grid + if trigger_id == "cluster-analysis" and cluster_click: + try: + # Get cluster indices from customdata + cluster_indices = cluster_click["points"][0]["customdata"] + if not cluster_indices: + return "", True, "No documents in this cluster" + + # Limit to first 12 documents for performance + display_indices = cluster_indices[:12] + cluster_id = labels[display_indices[0]] + + # Create comma-separated list of document IDs for the grid endpoint + doc_indices_list = [str(idx) for idx in display_indices] + doc_ids_str = ",".join(doc_indices_list) + + # Add timestamp to force browser refresh + import time + + timestamp = int(time.time() * 1000) # milliseconds + + return ( + "", + True, + html.Div( + [ + html.H5( + f"Cluster {cluster_id} ({len(cluster_indices)} documents)" + ), + html.Img( + src=f"/cluster_grid/{doc_ids_str}?t={timestamp}", + style={ + "width": "100%", + "max-height": "600px", + "object-fit": "contain", + "border": "1px solid #ddd", + "border-radius": "4px", + }, + ), + ] + ), + ) + except Exception as e: + print(f"Error displaying cluster: {e}") + return "", True, f"Error displaying cluster: {e}" + + # Handle single document click from scatter plot + if trigger_id == "scatter" and scatter_click: + try: + idx = int(scatter_click["points"][0]["pointIndex"]) + return ( + f"/image/{idx}", + False, + html.Div( + [ + html.P(f"Index: {idx}"), + html.P(f"Cluster: {labels[idx]}"), + html.P(f"DocID: {idx}"), + ] + ), + ) + except Exception as e: + return "", True, f"Error selecting point: {e}" + + return "", True, "Click a point or cluster to view documents" + + +# -------------------------- +# Layout +# -------------------------- +app.layout = html.Div( + [ + html.Div( + [ + # Dataset + dbc.Row( 
+ [ + dbc.Col( + [ + html.Label("Dataset", className="fw-bold"), + html.Div( + "Choose the dataset to analyze", + className="text-muted small mb-2", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id="dataset-dropdown", + options=dataset_options, + value="rvlcdip", + clearable=False, + ), + width=5, + ), + ] + ), + # Embedding Source + dbc.Row( + [ + dbc.Col( + [ + html.Label("Embedding source", className="fw-bold"), + html.Div( + "Which embedding model to use", + className="text-muted small mb-2", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id="embedding-dropdown", + options=[ + {"label": src, "value": src} + for src in embedding_sources + ], + value=embedding_sources[0], + clearable=False, + ), + width=5, + ), + ] + ), + # Intermediate dimensions + dbc.Row( + [ + dbc.Col( + [ + html.Label( + "Intermediate dimensions", className="fw-bold" + ), + html.Div( + "Projection size before clustering", + className="text-muted small mb-2", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id="intermediate-dropdown", + options=[ + {"label": str(d), "value": d} + for d in intermediate_options + ], + value=intermediate_options[0], + clearable=False, + ), + width=5, + ), + ] + ), + # Minimum cluster size + dbc.Row( + [ + dbc.Col( + [ + html.Label("Minimum cluster size", className="fw-bold"), + html.Div( + "Smallest allowed cluster size", + className="text-muted small mb-2", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id="min-cluster-size-dropdown", + options=[ + {"label": str(d), "value": d} + for d in min_cluster_size_options + ], + value=min_cluster_size_options[0], + clearable=False, + ), + width=5, + ), + ] + ), + # method + dbc.Row( + [ + dbc.Col( + [ + html.Label("Clustering method", className="fw-bold"), + html.Div( + "Which clustering algorithm to use", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id="method-dropdown", + options=[ + {"label": "k-means", "value": "kmeans"}, + {"label": "HDBSCAN", "value": "hdbscan"}, + ], + value="hdbscan", + clearable=False, + ), + width=5, + ), + ] + ), + ], + style={"gap": "15px"}, + ), + html.Div( + [ + dcc.Graph(id="scatter", style={"height": "700px"}), + # dcc.Graph(id="cluster-size-bar", style={"height": "400px"}), + dcc.Graph(id="cluster-analysis", style={"height": "800px"}), + ], + style={"width": "65%", "display": "inline-block", "verticalAlign": "top"}, + ), + html.Div( + [ + html.H4("Selected document"), + html.Div(id="doc-info", children="Click a point to open its PDF"), + html.Iframe( + id="pdf-viewer", + src="", + style={"width": "100%", "height": "700px"}, + hidden=True, + ), + ], + style={ + "width": "34%", + "display": "inline-block", + "paddingLeft": "10px", + "verticalAlign": "top", + }, + ), + ] +) + +# -------------------------- +if __name__ == "__main__": + app.run(debug=True, port=8055) diff --git a/docgenie/analyzation/clustering/webapp/_deprecated/visualize_metrics.py b/docgenie/analyzation/clustering/webapp/_deprecated/visualize_metrics.py new file mode 100755 index 0000000000000000000000000000000000000000..12bb3dc98fb5b14b25caaa689b89c59e0bdc515a --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/_deprecated/visualize_metrics.py @@ -0,0 +1,266 @@ +import dash +import dash_bootstrap_components as dbc +import pandas as pd +from dash import Input, Output, State, dash_table, dcc, html +from sklearn.preprocessing import MinMaxScaler + +from docgenie import ENV + +csv_fpath = ENV.CLUSTERS_DIR / "metrics-seed=42.csv" +df = pd.read_csv(csv_fpath) + +# Available metrics and their default 
optimization direction +METRICS = { + "num_clusters": "min", + "noise_percent": "min", + "connectivity__normalized_connectivity": "max", + "compactness__silhouette_score": "max", + "compactness__calinski_harabasz_score": "max", + "compactness__davies_bouldin_score": "min", + "balance__entropy": "max", + "balance__coefficient_of_variation": "min", + "balance__min-to-max-ratio": "max", + "balance__gini-coefficient": "min", +} + +METRIC_DESCRIPTIONS = { + "noise_percent (min)": "Proportion of points labeled as noise by HDBSCAN.", + "connectivity__normalized_connectivity (max)": "How connected clusters are (higher = more connected).", + "compactness__silhouette_score (max)": "Silhouette score (higher = better cluster separation).", + "compactness__calinski_harabasz_score (max)": "Calinski-Harabasz index (higher = better defined clusters).", + "compactness__davies_bouldin_score (min)": "Davies-Bouldin index (lower = better clustering).", + "balance__entropy (max)": "Entropy of cluster size distribution (higher = more balanced).", + "balance__coefficient_of_variation (min)": "Coefficient of variation of cluster sizes (lower = more balanced).", + "balance__min-to-max-ratio (max)": "Ratio of smallest to largest cluster size (higher = more balanced).", + "balance__gini-coefficient (min)": "Gini coefficient of cluster sizes (lower = more balanced).", +} + +# === Dash app === +app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP]) + + +@app.callback(Input("kernel-size-dropdown", "value")) +def update_direction_selectors(kernel_size): + global df + csv_fpath = ENV.CLUSTERS_DIR / "metrics-seed=42.csv" + df = pd.read_csv(csv_fpath) + print(f"Read {csv_fpath}") + + +app.layout = dbc.Container( + [ + dbc.Row( + [ + dbc.Col( + html.H2( + "Clustering Evaluation Dashboard", className="text-center my-3" + ) + ) + ] + ), + dbc.Row( + [ + dbc.Col( + [ + dbc.Alert( + [ + html.H5( + "How embeddings and clustering are created", + className="fw-bold", + ), + html.Ol( + [ + html.Li( + [ + "Embeddings are created akin to ", + html.A( + "Unsupervised Document and Template Clustering using Multimodal Embeddings", + href="https://arxiv.org/pdf/2506.12116", + target="_blank", + ), + ":", + html.Br(), + "Get mean of all text tokens, concatenate with image embedding. 
Image embedding is concatenation of all image patch tokens and then applying a kernel.", + ] + ), + html.Li( + "Embeddings are clustered in 2 stages: first HDBSCAN, the points labeled as noise (no cluster membership) are then assigned to identified clusters via k-NN" + ), + ] + ), + ], + color="light", + className="shadow-sm mb-4", + ) + ] + ) + ] + ), + dbc.Row( + [ + dbc.Col( + [ + dbc.Card( + [ + dbc.CardHeader("Metric Selection"), + dbc.CardBody( + [ + html.Label("Choose metrics to evaluate:"), + dcc.Checklist( + id="metric-checklist", + options=[ + {"label": m, "value": m} + for m in METRICS.keys() + ], + value=[], + className="mb-3", + ), + html.Div(id="direction-selectors"), + dbc.Button( + "Compute Best Results", + id="compute-btn", + color="primary", + className="mt-3", + ), + ] + ), + ], + className="mb-4", + ) + ], + width=4, + ), + dbc.Col( + [ + dbc.Card( + [ + dbc.CardHeader("Top Results"), + dbc.CardBody( + [ + dash_table.DataTable( + id="results-table", + page_size=10, + style_table={"overflowX": "auto"}, + style_cell={ + "textAlign": "left", + "padding": "8px", + "font-family": "monospace", + }, + style_header={ + "fontWeight": "bold", + "backgroundColor": "#f8f9fa", + }, + style_data_conditional=[ + { + "if": {"state": "active"}, + "backgroundColor": "#e9ecef", + "border": "1px solid #adb5bd", + }, + ], + ) + ] + ), + ] + ) + ], + width=8, + ), + ] + ), + ], + fluid=True, +) + + +@app.callback( + Output("direction-selectors", "children"), Input("metric-checklist", "value") +) +def update_direction_selectors(selected_metrics): + """Show dropdowns for choosing min/max and a description for each selected metric.""" + controls = [] + for m in selected_metrics: + description = METRIC_DESCRIPTIONS.get(m, "") + controls.append( + dbc.Card( + [ + dbc.CardBody( + [ + dbc.Row( + [ + dbc.Col( + [ + html.Label(m, className="fw-bold"), + html.Div( + description, + className="text-muted small mb-2", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id={ + "type": "direction-dropdown", + "metric": m, + }, + options=[ + {"label": "Maximize", "value": "max"}, + {"label": "Minimize", "value": "min"}, + ], + value=METRICS[m], + clearable=False, + ), + width=5, + ), + ] + ) + ] + ) + ], + className="mb-2", + ) + ) + return controls + + +@app.callback( + Output("results-table", "data"), + Output("results-table", "columns"), + Input("compute-btn", "n_clicks"), + State("metric-checklist", "value"), + State({"type": "direction-dropdown", "metric": dash.ALL}, "value"), + State({"type": "direction-dropdown", "metric": dash.ALL}, "id"), +) +def compute_best_results(n_clicks, selected_metrics, directions, ids): + if n_clicks == 0 or not selected_metrics: + return [], [] + + # Map metrics to directions + metric_directions = {i["metric"]: d for i, d in zip(ids, directions)} + + # Copy for normalization but keep original df for output + df_norm = df.copy() + + for col in selected_metrics: + scaler = MinMaxScaler() + values = df[[col]].values + normed = scaler.fit_transform(values) + if metric_directions[col] == "min": + normed = 1 - normed # flip so higher is better + df_norm[col] = normed + + df_norm["final_score"] = df_norm[selected_metrics].mean(axis=1) + + # Select top rows + best_idx = df_norm.sort_values("final_score", ascending=False).index + best = df.loc[best_idx].copy() + best["final_score"] = df_norm.loc[best_idx, "final_score"] + + # Convert to table + columns = [{"name": c, "id": c} for c in best.columns] + data = best.to_dict("records") + return data, columns + + +if __name__ == 
"__main__": + app.run(debug=True, port=8052) diff --git a/docgenie/analyzation/clustering/webapp/app.py b/docgenie/analyzation/clustering/webapp/app.py new file mode 100755 index 0000000000000000000000000000000000000000..fb49b1249f3d71db94ff8ab8e15c1ec02302dff2 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/app.py @@ -0,0 +1,37 @@ +import dash +import dash_bootstrap_components as dbc + +from docgenie.analyzation.clustering.webapp.config import settings +from docgenie.analyzation.clustering.webapp.components import create_app_layout +from docgenie.analyzation.clustering.webapp.callbacks import register_callbacks +from docgenie.analyzation.clustering.webapp.server_routes import setup_server_routes +from docgenie.analyzation.clustering.webapp.data_manager import data_manager + + +def create_app(): + """Create and configure the Dash application.""" + app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP]) + + # Setup Flask server routes + setup_server_routes(app.server) + + # Load initial dataset + data_manager.load_dataset(settings.default_dataset) + + # Set layout + app.layout = create_app_layout() + + # Register callbacks + register_callbacks(app) + + return app + + +def main(): + """Main entry point.""" + app = create_app() + app.run(debug=settings.debug, port=settings.port) + + +if __name__ == "__main__": + main() diff --git a/docgenie/analyzation/clustering/webapp/callbacks.py b/docgenie/analyzation/clustering/webapp/callbacks.py new file mode 100755 index 0000000000000000000000000000000000000000..ee2200eb0b15fd0848f4eede43d78dbc32fe32fa --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/callbacks.py @@ -0,0 +1,457 @@ +import time +import numpy as np +from dash import Input, Output, html, callback_context, dcc +import dash_bootstrap_components as dbc +import dash +import dash_bootstrap_components as dbc +import pandas as pd +from dash import Input, Output, State, dash_table, dcc, html +import plotly.graph_objects as go +from .utils.save_utils import get_graph_save_path, save_plotly_figure +from sklearn.preprocessing import MinMaxScaler +from .data_manager import data_manager +from .visualizations import ( + create_scatter_plot, + create_cluster_analysis_plot, + generate_individual_cluster_plots, +) +from .config import settings + + +def register_callbacks(app): + """Register all Dash callbacks.""" + + @app.callback( + [Input("dataset-dropdown", "value")], + ) + def update_dataset(dataset_name: str): + """Update global dataset when dropdown changes.""" + data_manager.load_dataset(dataset_name) + + @app.callback( + [ + Output("scatter", "figure"), + Output("cluster-analysis", "figure"), + ], + [ + Input("dataset-dropdown", "value"), + Input("intermediate-dropdown", "value"), + Input("min-cluster-size-dropdown", "value"), + Input("embedding-dropdown", "value"), + Input("method-dropdown", "value"), + ], + ) + def update_visualizations( + dataset_name: str, + intermediate_dims: int, + min_cluster_size: int, + embedding_src: str, + method: str, + ): + """Update scatter plot and cluster analysis when parameters change.""" + # Get cluster data + cluster_data = data_manager.get_cluster_data( + dataset_name, embedding_src, intermediate_dims, min_cluster_size, method + ) + + labels = cluster_data["cluster_data"]["cluster_labels"] + soft_clusters = cluster_data["cluster_data"]["soft_clusters"] + noise_mask = cluster_data["cluster_data"].get( + "noise_mask", np.array([False] * len(labels)) + ) + cluster_stats = cluster_data["cluster_stats"] + emb_2d = 
cluster_data["emb_2d"] + sample_ids = cluster_data["sample_ids"] + + # Create scatter plot dataframe + df = data_manager.create_scatter_dataframe( + labels, emb_2d, soft_clusters, sample_ids, noise_mask + ) + + # Create visualizations + scatter_fig = create_scatter_plot( + df, + embedding_src, + dataset_name, + min_cluster_size, + len(set(cluster_data["cluster_data"]["cluster_labels"])), + ) + cluster_analysis_fig = create_cluster_analysis_plot( + cluster_stats, dataset_name, labels + ) + + return scatter_fig, cluster_analysis_fig + + @app.callback( + [ + Output("pdf-viewer", "src"), + Output("pdf-viewer", "hidden"), + Output("doc-info", "children"), + ], + [ + Input("scatter", "clickData"), + Input("cluster-analysis", "clickData"), + Input("dataset-dropdown", "value"), + Input("intermediate-dropdown", "value"), + Input("min-cluster-size-dropdown", "value"), + Input("embedding-dropdown", "value"), + Input("method-dropdown", "value"), + ], + prevent_initial_call=False, + ) + def display_documents( + scatter_click: dict, + cluster_click: dict, + dataset_name: str, + intermediate_dims: int, + min_cluster_size: int, + embedding_src: str, + method: str, + ): + """Handle document display for both single and cluster clicks.""" + ctx = callback_context + if not ctx.triggered: + return "", True, "Click a point or cluster to view documents" + + trigger_id = ctx.triggered[0]["prop_id"].split(".")[0] + + cluster_data = data_manager.get_cluster_data( + dataset_name, embedding_src, intermediate_dims, min_cluster_size, method + ) + labels = cluster_data["cluster_data"]["cluster_labels"] + sample_ids = cluster_data["sample_ids"] + # Handle cluster click - show grid of documents + if trigger_id == "cluster-analysis" and cluster_click: + return _handle_cluster_click(cluster_click, labels, sample_ids) + + # Handle single document click + if trigger_id == "scatter" and scatter_click: + return _handle_scatter_click(scatter_click, labels, sample_ids) + + return "", True, "Click a point or cluster to view documents" + + @app.callback( + Output("direction-selectors", "children"), Input("metric-checklist", "value") + ) + def update_direction_selectors(selected_metrics: list): + """Show dropdowns for choosing min/max and a description for each selected metric.""" + controls = [] + for m in selected_metrics: + description = settings.metrics_list[m]["description"] + controls.append( + dbc.Card( + [ + dbc.CardBody( + [ + dbc.Row( + [ + dbc.Col( + [ + html.Label(m, className="fw-bold"), + html.Div( + description, + className="text-muted small mb-2", + ), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id={ + "type": "direction-dropdown", + "metric": m, + }, + options=[ + { + "label": "Maximize", + "value": "max", + }, + { + "label": "Minimize", + "value": "min", + }, + ], + value=settings.metrics_list[m][ + "direction" + ], + clearable=False, + ), + width=5, + ), + ] + ) + ] + ) + ], + className="mb-2", + ) + ) + return controls + + @app.callback( + Output("results-table", "data"), + Output("results-table", "columns"), + Input("compute-btn", "n_clicks"), + State("metric-checklist", "value"), + State({"type": "direction-dropdown", "metric": dash.ALL}, "value"), + State({"type": "direction-dropdown", "metric": dash.ALL}, "id"), + ) + def compute_best_results(n_clicks, selected_metrics, directions, ids): + if n_clicks == 0 or not selected_metrics: + return [], [] + + # Map metrics to directions + metric_directions = {i["metric"]: d for i, d in zip(ids, directions)} + + # Copy for normalization but keep original df for 
output + df = data_manager.metrics.copy() + df_norm = df.copy() + + for col in selected_metrics: + scaler = MinMaxScaler() + values = df[[col]].values + normed = scaler.fit_transform(values) + if metric_directions[col] == "min": + normed = 1 - normed # flip so higher is better + df_norm[col] = normed + + df_norm["final_score"] = df_norm[selected_metrics].mean(axis=1) + + # Select top rows + best_idx = df_norm.sort_values("final_score", ascending=False).index + best = df.loc[best_idx].copy() + best["final_score"] = df_norm.loc[best_idx, "final_score"] + + # Convert to table + columns = [{"name": c, "id": c} for c in best.columns] + data = best.to_dict("records") + return data, columns + + @app.callback( + Output("embedding-overview-table", "data"), + Output("embedding-overview-table", "columns"), + Input("dataset-dropdown", "value"), + Input("min-cluster-size-dropdown", "value"), + ) + def update_embedding_overview(dataset_name, min_cluster_size): + """Compute summary metrics (num clusters, silhouette, entropy) for all embeddings.""" + # Placeholder for results + rows = [] + + for embedding_src in settings.embedding_sources: + try: + # Load cluster data for each embedding + cluster_data = data_manager.get_cluster_data( + dataset_name, + embedding_src, + settings.default_intermediate, + min_cluster_size, + settings.default_method, + ) + + labels = cluster_data["cluster_data"]["cluster_labels"] + + # Number of clusters (excluding noise if labeled as -1) + valid_labels = labels[labels >= 0] + n_clusters = len(np.unique(valid_labels)) + + # Silhouette score (skip if only 1 cluster) + if n_clusters > 1: + from sklearn.metrics import silhouette_score + + emb = cluster_data["emb_2d"] # or full embeddings if available + sil = silhouette_score(emb, labels) + else: + sil = np.nan + + # Entropy of cluster distribution + from scipy.stats import entropy + + cluster_sizes = np.bincount(valid_labels) + probs = cluster_sizes / cluster_sizes.sum() + ent = entropy(probs) + + rows.append( + dict( + embedding=embedding_src, + n_clusters=n_clusters, + silhouette=round(sil, 3) if not np.isnan(sil) else "—", + entropy=round(ent, 3), + ) + ) + + except Exception as e: + rows.append( + dict( + embedding=embedding_src, + n_clusters="Error", + silhouette="Error", + entropy=str(e), + ) + ) + + df = pd.DataFrame(rows) + columns = [{"name": c.replace("_", " ").title(), "id": c} for c in df.columns] + return df.to_dict("records"), columns + + @app.callback( + Output("save-feedback", "children"), + Input("save-all-graphs-btn", "n_clicks"), + [ + State("scatter", "figure"), + State("cluster-analysis", "figure"), + State("dataset-dropdown", "value"), + State("embedding-dropdown", "value"), + State("min-cluster-size-dropdown", "value"), + State("intermediate-dropdown", "value"), + State("method-dropdown", "value"), + ], + prevent_initial_call=True, + ) + def save_all_graphs( + n_clicks, + scatter_fig, + cluster_fig, + dataset_name, + embedding_src, + min_cluster_size, + intermediate_dims, + method, + ): + """Save scatter plot and the cluster-analysis subplots separately.""" + if not dataset_name or not embedding_src: + return dbc.Alert("Missing dataset or embedding source.", color="danger") + + saved_paths = [] + errors = [] + ext = "pdf" + + cluster_data = data_manager.get_cluster_data( + dataset_name, embedding_src, intermediate_dims, min_cluster_size, method + ) + + if scatter_fig: + try: + nclusters = len(set(cluster_data["cluster_data"]["cluster_labels"])) + fig = go.Figure(scatter_fig) + path = get_graph_save_path( + 
dataset_name, + "scatter", + embedding_src, + min_cluster_size, + ext=ext, + nclusters=nclusters, + ) + saved = save_plotly_figure(fig, path, fmt=ext) + saved_paths.append(saved) + except Exception as e: + errors.append(f"Scatter: {e}") + + try: + cluster_data = data_manager.get_cluster_data( + dataset_name, embedding_src, intermediate_dims, min_cluster_size, method + ) + nclusters = len(set(cluster_data["cluster_data"]["cluster_labels"])) + cluster_stats = cluster_data["cluster_stats"] + + """This function is generating cluster analysis figurs sepratley + because in the webapp there is on plot having further subplots + if I save that which I tried it looks odd like text is overlapping + so I created separate plots for it and save that separately.""" + per_plot_figs = generate_individual_cluster_plots( + cluster_stats, dataset_name=dataset_name + ) + + for plot_name, fig in per_plot_figs.items(): + try: + path = get_graph_save_path( + dataset_name, + plot_name, + embedding_src, + min_cluster_size, + ext=ext, + ) + saved = save_plotly_figure(fig, path, fmt=ext) + saved_paths.append(saved) + except Exception as e_plot: + errors.append(f"{plot_name}: {e_plot}") + except Exception as e: + errors.append(f"Cluster subplots generation failed: {e}") + + if errors and saved_paths: + msg = "
".join(errors) + return dbc.Alert( + f"Some graphs saved, some failed:
{msg}", + color="warning", + dismissable=True, + ) + elif errors and not saved_paths: + msg = "
".join(errors) + return dbc.Alert( + f"Failed to save graphs:
{msg}", color="danger", dismissable=True + ) + else: + return dbc.Alert( + f"Saved {len(saved_paths)} files:
" + "
".join(saved_paths), + color="success", + dismissable=True, + ) + + +def _handle_cluster_click(cluster_click: dict, labels, sample_ids): + """Handle cluster visualization clicks.""" + try: + cluster_indices = cluster_click["points"][0]["customdata"] + if not cluster_indices: + return "", True, "No documents in this cluster" + + # Limit documents for performance + display_indices = cluster_indices[:12] + + # Create grid URL + doc_ids_str = ",".join(str(sample_ids[idx]) for idx in display_indices) + timestamp = int(time.time() * 1000) + cluster_id = labels[display_indices[0]] + + return ( + "", + True, + html.Div( + [ + html.H5(f"Cluster {cluster_id} ({len(cluster_indices)} documents)"), + html.Img( + src=f"/cluster_grid/{doc_ids_str}?t={timestamp}", + style={ + "width": "100%", + "max-height": "600px", + "object-fit": "contain", + "border": "1px solid #ddd", + "border-radius": "4px", + }, + ), + ] + ), + ) + except Exception as e: + print(f"Error displaying cluster: {e}") + return "", True, f"Error displaying cluster: {e}" + + +def _handle_scatter_click(scatter_click, labels, sample_ids): + """Handle scatter plot clicks.""" + try: + idx = int(scatter_click["points"][0]["pointIndex"]) + sample_id = str(sample_ids[idx]) + return ( + f"/image/{sample_id}", + False, + html.Div( + [ + html.P(f"Index: {idx}"), + html.P(f"Cluster: {labels[idx]}"), + html.P(f"DocID: {idx}"), + ] + ), + ) + except Exception as e: + return "", True, f"Error selecting point: {e}" diff --git a/docgenie/analyzation/clustering/webapp/components.py b/docgenie/analyzation/clustering/webapp/components.py new file mode 100755 index 0000000000000000000000000000000000000000..25fa43932d11274c7ab74f78ff22eb265c80b23d --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/components.py @@ -0,0 +1,313 @@ +import dash_bootstrap_components as dbc +from dash import dcc, html + +from .config import settings +from dash import dash_table, dcc, html + + +def create_control_panel(): + """Create the main control panel with all dropdowns.""" + return html.Div( + [ + _create_dropdown_row( + "Dataset", + "Choose the dataset to analyze", + "dataset-dropdown", + [{"label": ds, "value": ds} for ds in settings.dataset_options], + settings.default_dataset, + ), + _create_dropdown_row( + "Embedding source", + "Which embedding model to use", + "embedding-dropdown", + [{"label": src, "value": src} for src in settings.embedding_sources], + settings.default_embedding, + ), + _create_dropdown_row( + "Intermediate dimensions", + "Projection size before clustering", + "intermediate-dropdown", + [{"label": str(d), "value": d} for d in settings.intermediate_options], + settings.default_intermediate, + ), + _create_dropdown_row( + "Minimum cluster size", + "Smallest allowed cluster size", + "min-cluster-size-dropdown", + [ + {"label": str(d), "value": d} + for d in settings.min_cluster_size_options + ], + settings.default_min_cluster_size, + ), + _create_dropdown_row( + "Clustering method", + "Which clustering algorithm to use", + "method-dropdown", + [ + {"label": "HDBSCAN", "value": "hdbscan"}, + ], + settings.default_method, + ), + ], + style={"gap": "15px"}, + ) + + +def _create_dropdown_row(label, description, dropdown_id, options, value): + """Create a standardized dropdown row.""" + return dbc.Row( + [ + dbc.Col( + [ + html.Label(label, className="fw-bold"), + html.Div(description, className="text-muted small mb-2"), + ], + width=7, + ), + dbc.Col( + dcc.Dropdown( + id=dropdown_id, + options=options, + value=value, + clearable=False, + ), + 
width=5, + ), + ] + ) + + +def create_visualization_panel(): + """Create main visualization panel with Save Graphs button.""" + return html.Div( + [ + # Top Row: Control Buttons + dbc.Row( + [ + dbc.Col( + dbc.Button( + "Save Graphs", + id="save-all-graphs-btn", + color="primary", + className="me-2", + style={"width": "100%"}, + ), + width=3, + ), + dbc.Col(html.Div(id="save-feedback", style={"marginTop": "5px"}), width=9), + ], + className="mb-3", + ), + + # Graphs + dcc.Graph(id="scatter", style={"height": "700px"}), + dcc.Graph(id="cluster-analysis", style={"height": "800px"}), + ], + style={"width": "65%", "display": "inline-block", "verticalAlign": "top"}, + ) + + + +def create_document_viewer(): + """Create the document viewer panel.""" + return html.Div( + [ + html.H4("Selected document"), + html.Div(id="doc-info", children="Click a point to open its document"), + html.Iframe( + id="pdf-viewer", + src="", + style={"width": "100%", "height": "700px"}, + hidden=True, + ), + ], + style={ + "width": "34%", + "display": "inline-block", + "paddingLeft": "10px", + "verticalAlign": "top", + }, + ) + + +def create_metrics_viewer(): + """Create the metrics evaluation panel.""" + return html.Div( + [ + dbc.Container( + [ + dbc.Row( + [ + dbc.Col( + html.H2( + "Clustering Evaluation Dashboard", + className="text-center my-3", + ) + ) + ] + ), + dbc.Row( + [ + dbc.Col( + [ + dbc.Alert( + [ + html.H5( + "How embeddings and clustering are created", + className="fw-bold", + ), + html.Ol( + [ + html.Li( + [ + "Embeddings are created akin to ", + html.A( + "Unsupervised Document and Template Clustering using Multimodal Embeddings", + href="https://arxiv.org/pdf/2506.12116", + target="_blank", + ), + ":", + html.Br(), + "Get mean of all text tokens, concatenate with image embedding. 
Image embedding is concatenation of all image patch tokens and then applying a kernel.", + ] + ), + html.Li( + "Embeddings are clustered in 2 stages: first HDBSCAN, the points labeled as noise (no cluster membership) are then assigned to identified clusters via k-NN" + ), + ] + ), + ], + color="light", + className="shadow-sm mb-4", + ) + ] + ) + ] + ), + dbc.Row( + [ + dbc.Col( + [ + dbc.Card( + [ + dbc.CardHeader("Metric Selection"), + dbc.CardBody( + [ + html.Label( + "Choose metrics to evaluate:" + ), + dcc.Checklist( + id="metric-checklist", + options=[ + {"label": m, "value": m} + for m in settings.metrics_list.keys() + ], + value=[], + className="mb-3", + ), + html.Div(id="direction-selectors"), + dbc.Button( + "Compute Best Results", + id="compute-btn", + color="primary", + className="mt-3", + ), + ] + ), + ], + className="mb-4", + ) + ], + width=4, + ), + dbc.Col( + [ + dbc.Card( + [ + dbc.CardHeader("Top Results"), + dbc.CardBody( + [ + dash_table.DataTable( + id="results-table", + page_size=10, + style_table={ + "overflowX": "auto" + }, + style_cell={ + "textAlign": "left", + "padding": "8px", + "font_family": "monospace", + }, + style_header={ + "fontWeight": "bold", + "backgroundColor": "#f8f9fa", + }, + style_data_conditional=[ + { + "if": { + "state": "active" + }, + "backgroundColor": "#e9ecef", + "border": "1px solid #adb5bd", + }, + ], + ) + ] + ), + ] + ) + ], + width=8, + ), + ] + ), + ], + fluid=True, + ) + ] + ) + + +def create_overview_panel(): + """Overview table: shows per-embedding metrics for selected dataset.""" + return html.Div( + [ + html.H4("Embedding Overview", className="mt-4"), + html.Div( + [ + html.P( + "Shows summary metrics for each embedding method on the selected dataset:", + className="text-muted small", + ), + dash_table.DataTable( + id="embedding-overview-table", + style_table={"overflowX": "auto"}, + style_cell={ + "textAlign": "left", + "padding": "8px", + "font_family": "monospace", + }, + style_header={ + "fontWeight": "bold", + "backgroundColor": "#f8f9fa", + }, + ), + ] + ), + ], + style={"marginTop": "30px"}, + ) + + +def create_app_layout(): + """Create the complete app layout.""" + return html.Div( + [ + create_control_panel(), + create_overview_panel(), + create_visualization_panel(), + create_document_viewer(), + create_metrics_viewer(), + ] + ) diff --git a/docgenie/analyzation/clustering/webapp/config.py b/docgenie/analyzation/clustering/webapp/config.py new file mode 100755 index 0000000000000000000000000000000000000000..da90e5cfc2dd8da1949aa5705767c40fdd34a880 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/config.py @@ -0,0 +1,102 @@ +import os +from typing import List +from pydantic_settings import BaseSettings +from docgenie import ENV +from pathlib import Path + + +class AppSettings(BaseSettings): + # App Config + debug: bool = True + port: int = 8055 + graphs_base_dir: Path = ENV.CLUSTER_PLOTS + external_stylesheets: List[str] = [ + "https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" + ] + + # Clustering Options + embedding_sources: List[str] = [ + "paper_kernel=4", + "layout", + "image", + "text", + "combined", + ] + intermediate_options: List[int] = [100] + min_cluster_size_options: List[int] = [5, 10] + dataset_options: List[str] = sorted([d for d in os.listdir(ENV.CLUSTERS_DIR)]) + + # Default Values + default_dataset: str = "tobacco3482" + default_embedding: str = "paper_kernel=4" + default_intermediate: int = 100 + default_min_cluster_size: int = 5 + default_method: str = "hdbscan" + + # 
Clustering Params + seed: int = 42 + metric: str = "euclidean" + k_nn_n_neighbors: int = 5 + do_knn: bool = False + + # Grid Config + max_images: int = 12 + thumb_width: int = 200 + thumb_height: int = 280 + spacing: int = 10 + max_cols: int = 4 + + # Metric Descriptions and Optimization Direction + @property + def metrics_list(self) -> dict: + return { + "num_clusters": { + "direction": "min", + "description": "Total number of clusters formed (excluding noise).", + }, + "noise_percent": { + "direction": "min", + "description": "Proportion of points labeled as noise by HDBSCAN.", + }, + "connectivity__normalized_connectivity": { + "direction": "max", + "description": "How connected clusters are (higher = more connected).", + }, + "compactness__silhouette_score": { + "direction": "max", + "description": "Silhouette score (higher = better cluster separation).", + }, + "compactness__calinski_harabasz_score": { + "direction": "max", + "description": "Calinski-Harabasz index (higher = better defined clusters).", + }, + "compactness__davies_bouldin_score": { + "direction": "min", + "description": "Davies-Bouldin index (lower = better clustering).", + }, + "balance__entropy": { + "direction": "max", + "description": "Entropy of cluster size distribution (higher = more balanced).", + }, + "balance__coefficient_of_variation": { + "direction": "min", + "description": "Coefficient of variation of cluster sizes (lower = more balanced).", + }, + "balance__min-to-max-ratio": { + "direction": "max", + "description": "Ratio of smallest to largest cluster size (higher = more balanced).", + }, + "balance__gini-coefficient": { + "direction": "min", + "description": "Gini coefficient of cluster sizes (lower = more balanced).", + }, + } + + # Load metrics CSV + @property + def metrics_csv_path(self) -> str: + return str(ENV.CLUSTERS_DIR / f"metrics-seed={self.seed}.csv") + + +# Initialize settings +settings = AppSettings() diff --git a/docgenie/analyzation/clustering/webapp/data_manager.py b/docgenie/analyzation/clustering/webapp/data_manager.py new file mode 100755 index 0000000000000000000000000000000000000000..f222f1bf1698ac588b2b8c06121ffbd581faf5a0 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/data_manager.py @@ -0,0 +1,123 @@ +import os +import pickle +import numpy as np +import pandas as pd +from typing import Dict, Tuple, Optional + +from docgenie import ENV +from docgenie.analyzation.clustering.core._embeddings import ( + _load_sample_ids_from_embeddings, +) +from docgenie.data import load_dataset +from docgenie.analyzation.clustering.core._utilities import _get_clustering_output_path +from .config import settings +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +class DataManager: + """Manages dataset and clustering data loading.""" + + def __init__(self): + self.dataset = None + self.dataset_name = None + self.metrics = None + self.cluster_data_cache = {} + + def load_dataset(self, dataset_name: str): + """Load dataset and update internal state.""" + if self.dataset_name != dataset_name: + self.dataset = load_dataset(dataset_name=dataset_name, split="train") + self.dataset_name = dataset_name + self.metrics = pd.read_csv(settings.metrics_csv_path) + self.metrics = self.metrics[self.metrics["dataset_name"] == dataset_name] + + def get_cluster_data( + self, + dataset_name: str, + embedding_src: str, + intermediate_dims: int, + min_cluster_size: int, + method: str, + ) -> Dict: + """Load clustering results with caching.""" + cache_key = ( + dataset_name, + 
embedding_src, + intermediate_dims, + min_cluster_size, + method, + ) + + if cache_key not in self.cluster_data_cache: + output_dir = ENV.CLUSTERS_DIR / dataset_name / embedding_src + sample_ids = _load_sample_ids_from_embeddings( + file_path=ENV.EMBEDDINGS_DIR / dataset_name / f"{embedding_src}.h5" + ) + logger.info("Loading clustering results from %s", output_dir) + clusters_path = _get_clustering_output_path( + output_dir=output_dir, + intermediate_num_dims=intermediate_dims, + hdbscan_min_cluster_size=1 if method == "kmeans" else min_cluster_size, + hdbscan_metric=settings.metric, + k_nn_n_neighbors=settings.k_nn_n_neighbors, + method=method, + ) + + # Load cluster data + cluster_data = np.load(clusters_path, allow_pickle=True).item() + + # Load cluster statistics + stats_path = clusters_path.parent / clusters_path.name.replace( + ".npy", "_stats.csv" + ) + cluster_stats = pd.read_csv(stats_path) + + # Load 2D embeddings + emb_2d_path = ( + output_dir + / f"reduced_embeddings_2_{settings.metric}_{settings.seed}.pkl" + ) + if not os.path.exists(emb_2d_path): + raise ValueError(f"2D embeddings not found: {emb_2d_path}") + + with open(emb_2d_path, "rb") as f: + emb_2d = pickle.load(f) + + self.cluster_data_cache[cache_key] = { + "sample_ids": sample_ids, + "cluster_data": cluster_data, + "cluster_stats": cluster_stats, + "emb_2d": emb_2d, + } + + return self.cluster_data_cache[cache_key] + + def create_scatter_dataframe( + self, + labels: np.ndarray, + emb_2d: np.ndarray, + soft_clusters: np.ndarray, + sample_ids: np.ndarray, + noise_mask: Optional[np.ndarray] = None, + ) -> pd.DataFrame: + """Create DataFrame for scatter plot visualization.""" + if noise_mask is None: + noise_mask = np.array([False] * len(labels)) + + x, y = emb_2d[:, 0], emb_2d[:, 1] + return pd.DataFrame( + { + "doc_id": sample_ids, + "x": x, + "y": y, + "label": labels, + "prob": np.max(soft_clusters, axis=1), + "index": np.arange(len(labels)), + "noise_mask": noise_mask, + } + ) + + +# Global instance +data_manager = DataManager() diff --git a/docgenie/analyzation/clustering/webapp/server_routes.py b/docgenie/analyzation/clustering/webapp/server_routes.py new file mode 100755 index 0000000000000000000000000000000000000000..b7d5540d1592e3ceb4d7c9cca63ed5a3abfbba8e --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/server_routes.py @@ -0,0 +1,118 @@ +import io +from flask import Response +from PIL import Image, ImageDraw, ImageFont + +from .data_manager import data_manager +from .config import settings +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +def setup_server_routes(server): + """Setup Flask server routes.""" + + @server.route("/image/") + def serve_image(sample_id: str): + """Serve individual document image.""" + try: + sample = data_manager.dataset.train.get_by_id(sample_id) + assert sample.sample_id == sample_id, ( + f"Mismatched sample ID, found {sample.sample_id}, expected {sample_id}" + ) + image = sample.image.content + img_io = io.BytesIO() + image.save(img_io, "PNG") + img_io.seek(0) + return Response(img_io.getvalue(), mimetype="image/png") + except (ValueError, IndexError) as e: + return f"Invalid sample ID: {str(e)}", 404 + except Exception as e: + return f"Error serving image: {str(e)}", 500 + + @server.route("/cluster_grid/") + def serve_cluster_grid(sample_ids): + """Create and serve a grid image from multiple document images.""" + try: + sample_ids = _parse_ids(sample_ids) + grid_img = _create_grid_image(sample_ids) + return _image_response(grid_img) + except 
Exception as e: + return f"Error creating grid: {str(e)}", 500 + + +def _parse_ids(sample_ids): + """Parse and validate sample_ids list.""" + return sample_ids.split(",")[: settings.max_images] + + +def _create_grid_image(sample_ids): + """Create a grid image from document sample_ids.""" + cols = min(settings.max_cols, len(sample_ids)) + rows = (len(sample_ids) + cols - 1) // cols + + grid_width = cols * settings.thumb_width + (cols - 1) * settings.spacing + grid_height = rows * settings.thumb_height + (rows - 1) * settings.spacing + + grid_img = Image.new("RGB", (grid_width, grid_height), "white") + + for i, sample_id in enumerate(sample_ids): + _add_thumbnail_to_grid(grid_img, sample_id, i, cols) + + return grid_img + + +def _add_thumbnail_to_grid(grid_img, sample_id, position, cols): + """Add a single thumbnail to the grid.""" + row = position // cols + col = position % cols + x = col * (settings.thumb_width + settings.spacing) + y = row * (settings.thumb_height + settings.spacing) + + try: + sample = data_manager.dataset.train.get_by_id(sample_id) + assert sample.sample_id == sample_id, ( + f"Mismatched sample ID, found {sample.sample_id}, expected {sample_id}" + ) + image = sample.image.content + image.thumbnail( + (settings.thumb_width, settings.thumb_height - 30), Image.Resampling.LANCZOS + ) + grid_img.paste(image, (x, y)) + _add_label(grid_img, sample_id, x, y + image.height + 5) + except Exception as e: + logger.exception(f"Error loading image for sample ID {sample_id}") + _draw_error_placeholder(grid_img, sample_id, x, y) + + +def _add_label(grid_img, text, x, y): + """Add text label to the grid.""" + draw = ImageDraw.Draw(grid_img) + font = _get_font() + draw.text((x, y), text, fill="black", font=font) + + +def _get_font(): + """Get font for text rendering.""" + try: + return ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12) + except Exception: + return ImageFont.load_default() + + +def _draw_error_placeholder(grid_img, index, x, y): + """Draw error placeholder when image fails to load.""" + draw = ImageDraw.Draw(grid_img) + draw.rectangle( + [x, y, x + settings.thumb_width, y + settings.thumb_height - 30], + outline="gray", + fill="lightgray", + ) + draw.text((x + 10, y + 10), f"Error loading\n{index}", fill="black") + + +def _image_response(image): + """Convert PIL Image to Flask Response.""" + img_io = io.BytesIO() + image.save(img_io, "PNG") + img_io.seek(0) + return Response(img_io.getvalue(), mimetype="image/png") diff --git a/docgenie/analyzation/clustering/webapp/utils/save_utils.py b/docgenie/analyzation/clustering/webapp/utils/save_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..3d79e6f9d8ee18f0883b0a24073e04dd17e62644 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/utils/save_utils.py @@ -0,0 +1,50 @@ +import os +from pathlib import Path +import plotly.graph_objects as go +import plotly.io as pio +from datetime import datetime + +from ..config import settings + + +def ensure_dir(path: Path): + """Ensure directory exists.""" + path.mkdir(parents=True, exist_ok=True) + + +def get_graph_save_path( + dataset_name: str, + graph_type: str, + embedding_src: str, + min_cluster_size: int, + ext: str = "png", + nclusters: int | None = None, +) -> Path: + """ + Construct structured path to save graph image. 
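    An illustrative call (the argument values here are hypothetical):
        get_graph_save_path("tobacco3482", "scatter", "paper_kernel=4", 5, ext="pdf", nclusters=12)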
+ + Example: + graphs/tobacco3482/scatter/tobacco3482_scatter_paper_kernel=4_min5.png + """ + base_dir = Path(settings.graphs_base_dir) + dataset_dir = base_dir / dataset_name / graph_type + ensure_dir(dataset_dir) + + filename = f"{dataset_name}_{graph_type}_{embedding_src=}_{min_cluster_size=}_{nclusters=}.{ext}" + + return dataset_dir / filename + + +def save_plotly_figure( + fig: go.Figure, save_path: Path, fmt: str = "png", scale: int = 2 +): + """ + Save Plotly figure to disk. + Requires `kaleido` to be installed. + """ + ensure_dir(save_path.parent) + try: + pio.write_image(fig, str(save_path), format=fmt, scale=scale) + return str(save_path) + except Exception as e: + raise RuntimeError(f"Error saving figure to {save_path}: {e}") diff --git a/docgenie/analyzation/clustering/webapp/visualizations.py b/docgenie/analyzation/clustering/webapp/visualizations.py new file mode 100755 index 0000000000000000000000000000000000000000..17231b8642480166c8635313acdf454dc78d4ef7 --- /dev/null +++ b/docgenie/analyzation/clustering/webapp/visualizations.py @@ -0,0 +1,262 @@ +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from pathlib import Path + + +def map_embedding_name_to_final_name(embedding_name: str): + match embedding_name: + case "layout": + return "layoutlm" + case "image": + return "clip" + case "text": + return "sentence" + case "paper_kernel=4": + return "pooled" + case "combined": + return "combined" + + +def create_scatter_plot( + df: pd.DataFrame, + embedding_src: str, + dataset_name: str, + min_cluster_size: int, + n_cluster: int, +) -> go.Figure: + """Create interactive scatter plot of document embeddings.""" + embedding_src = map_embedding_name_to_final_name(embedding_src) + + # # Force categorical colors if labels are numeric + # df = df.copy() # Avoid modifying original + # df["label"] = df["label"].astype(str) + + fig = px.scatter( + df, + x="x", + y="y", + color="label", + # labels={"label": ""}, + hover_data={"index": True, "label": True, "doc_id": True}, + # title=f"{dataset_name}: '{embedding_src}' Embeddings, κ={min_cluster_size}, {n_cluster} Clusters", + ) + margin = 0 + + fig.update_traces(marker=dict(size=7, showscale=False), customdata=df["index"]) + fig.update_layout( + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=margin, r=margin, t=margin, b=margin), + # legend_title="Cluster", + showlegend=False, + coloraxis_showscale=False, + ) + + fig.update_xaxes(visible=False) + fig.update_yaxes(visible=False) + + return fig + + +"""This function is used to display the analysis plots as subplot i.e. 
one figure containing all plots""" + + +def create_cluster_analysis_plot( + cluster_df: pd.DataFrame, + dataset_name: str, + cluster_labels: np.ndarray, +) -> go.Figure: + """Create comprehensive cluster analysis visualization with clickable clusters.""" + fig = make_subplots( + rows=4, + cols=1, + subplot_titles=( + "Cluster Sizes", + "Cluster Variances", + "Size vs Variance", + "Distribution", + ), + specs=[ + [{"type": "bar"}], + [{"type": "bar"}], + [{"type": "scatter"}], + [{"type": "histogram"}], + ], + ) + + # Prepare cluster indices for click events + cluster_indices = {} + for cluster_id in cluster_df["cluster_id"]: + indices = np.where(cluster_labels == cluster_id)[0].tolist() + cluster_indices[cluster_id] = indices + + # Plot 1: Cluster sizes (clickable) + fig.add_trace( + go.Bar( + x=cluster_df["cluster_id"], + y=cluster_df["size"], + name="Size", + customdata=[cluster_indices[cid] for cid in cluster_df["cluster_id"]], + hovertemplate="Cluster %{x}
<br>Size: %{y}<br>
Click to view images", + ), + row=1, + col=1, + ) + + # Plot 2: Cluster variances + fig.add_trace( + go.Bar( + x=cluster_df["cluster_id"], + y=cluster_df["variance"], + customdata=[cluster_indices[cid] for cid in cluster_df["cluster_id"]], + name="Variance", + ), + row=2, + col=1, + ) + + # Plot 3: Size vs Variance scatter (clickable) + fig.add_trace( + go.Scatter( + x=cluster_df["size"], + y=cluster_df["variance"], + mode="markers", + text=cluster_df["cluster_id"], + name="Clusters", + customdata=[cluster_indices[cid] for cid in cluster_df["cluster_id"]], + hovertemplate="Cluster %{text}
<br>Size: %{x}<br>Variance: %{y}<br>
Click to view images", + ), + row=3, + col=1, + ) + + # Plot 4: Size distribution + fig.add_trace( + go.Histogram(x=cluster_df["size"], name="Size Distribution"), + row=4, + col=1, + ) + + fig.update_layout( + title_text=f"Cluster Analysis for {dataset_name}", + showlegend=False, + height=1200, + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=40, r=40, t=40, b=40), + ) + + _update_subplot_axes(fig) + return fig + + +def _update_subplot_axes(fig: go.Figure) -> None: + """Update axes labels for all subplots.""" + fig.update_xaxes(title_text="Cluster ID", row=1, col=1) + fig.update_yaxes(title_text="Size", row=1, col=1) + fig.update_xaxes(title_text="Cluster ID", row=2, col=1) + fig.update_yaxes(title_text="Variance", row=2, col=1) + fig.update_xaxes(title_text="Size", row=3, col=1) + fig.update_yaxes(title_text="Variance", row=3, col=1) + fig.update_xaxes(title_text="Size", row=4, col=1) + fig.update_yaxes(title_text="Count", row=4, col=1) + + +"""This function is used to save cluster analysis plots separately not as a single plot""" + + +def generate_individual_cluster_plots(cluster_df, dataset_name: str) -> dict: + """ + Given cluster_df (DataFrame with columns 'cluster_id', 'size', 'variance'), + return a dict of plot_name -> go.Figure, one per subplot: + - cluster_sizes + - cluster_variances + - size_vs_variance + - distribution + + Note: dataset_name is unused for plotting but kept for potential titles. + """ + plots = {} + + # Ensure expected columns exist + if not {"cluster_id", "size", "variance"}.issubset(cluster_df.columns): + raise ValueError( + "cluster_df must contain 'cluster_id', 'size', and 'variance' columns" + ) + + # Cluster Sizes (bar) + fig_sizes = go.Figure() + fig_sizes.add_trace( + go.Bar( + x=cluster_df["cluster_id"], + y=cluster_df["size"], + name="Size", + ) + ) + fig_sizes.update_layout( + title_text=f"Cluster Sizes{' — ' + dataset_name if dataset_name else ''}", + xaxis_title="Cluster ID", + yaxis_title="Size", + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=20, r=20, t=40, b=20), + ) + plots["cluster_sizes"] = fig_sizes + + # Cluster Variances (bar) + fig_var = go.Figure() + fig_var.add_trace( + go.Bar( + x=cluster_df["cluster_id"], + y=cluster_df["variance"], + name="Variance", + ) + ) + fig_var.update_layout( + title_text=f"Cluster Variances{' — ' + dataset_name if dataset_name else ''}", + xaxis_title="Cluster ID", + yaxis_title="Variance", + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=20, r=20, t=40, b=20), + ) + plots["cluster_variances"] = fig_var + + # Size vs Variance (scatter) + fig_sv = go.Figure() + fig_sv.add_trace( + go.Scatter( + x=cluster_df["size"], + y=cluster_df["variance"], + mode="markers", + text=cluster_df["cluster_id"], + name="Size vs Variance", + ) + ) + fig_sv.update_layout( + title_text=f"Size vs Variance{' — ' + dataset_name if dataset_name else ''}", + xaxis_title="Size", + yaxis_title="Variance", + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=20, r=20, t=40, b=20), + ) + plots["size_vs_variance"] = fig_sv + + # Distribution (histogram) + fig_dist = go.Figure() + fig_dist.add_trace(go.Histogram(x=cluster_df["size"], name="Size Distribution")) + fig_dist.update_layout( + title_text=f"Distribution{' — ' + dataset_name if dataset_name else ''}", + xaxis_title="Size", + yaxis_title="Count", + plot_bgcolor="white", + paper_bgcolor="white", + margin=dict(l=20, r=20, t=40, b=20), + ) + plots["distribution"] = fig_dist + + return plots diff --git 
a/docgenie/analyzation/gt/cls/cls_qa_analysis.py b/docgenie/analyzation/gt/cls/cls_qa_analysis.py new file mode 100755 index 0000000000000000000000000000000000000000..f2248c523a40abc42fe24d7fc2e35945907021a0 --- /dev/null +++ b/docgenie/analyzation/gt/cls/cls_qa_analysis.py @@ -0,0 +1,381 @@ +import argparse +import json +import numpy as np +from collections import Counter +from scipy.stats import entropy +import matplotlib.pyplot as plt +import seaborn as sns + +from docgenie import ENV +from docgenie.analyzation.gt.webapp import get_base_dataset_name +from docgenie.data.interfaces.dataset import load_dataset +from docgenie.generation.models._syndatadef import SynDatasetDefinition + +# Set seaborn style for CVPR-quality plots +sns.set_theme(style="whitegrid", context="paper", palette="colorblind") +plt.rcParams["font.family"] = "serif" +plt.rcParams["font.size"] = 10 + + +def extract_labels(dataset, label_mapping: dict[str, str] = None): + """Extract classification labels from dataset.""" + labels = [] + for sample in dataset.train: + if sample.annotations: + label = sample.annotations[0].label.name + + if label_mapping is not None and len(label_mapping) > 0: + label = label_mapping[label] + + labels.append(label) + return labels + + +def compute_distribution(labels): + """Compute label distribution.""" + counter = Counter(labels) + total = sum(counter.values()) + distribution = {k: v / total for k, v in sorted(counter.items())} + return counter, distribution + + +def compare_distributions(real_dist, synth_dist): + """Compare two distributions using various metrics.""" + # Align keys + all_labels = sorted(set(real_dist.keys()) | set(synth_dist.keys())) + + real_probs = np.array([real_dist.get(k, 0) for k in all_labels]) + synth_probs = np.array([synth_dist.get(k, 0) for k in all_labels]) + + # KL divergence (add small epsilon to avoid log(0)) + epsilon = 1e-10 + kl_div = entropy(real_probs + epsilon, synth_probs + epsilon) + + # Total Variation Distance + tvd = 0.5 * np.sum(np.abs(real_probs - synth_probs)) + + # L2 distance + l2_dist = np.linalg.norm(real_probs - synth_probs) + + return { + "kl_divergence": float(kl_div), + "total_variation_distance": float(tvd), + "l2_distance": float(l2_dist), + } + + +def plot_absolute_counts(real_counter, synth_counter, synth_dataset_name, save_path): + """Plot absolute class counts comparison.""" + all_labels = sorted(set(real_counter.keys()) | set(synth_counter.keys())) + real_counts = [real_counter.get(k, 0) for k in all_labels] + synth_counts = [synth_counter.get(k, 0) for k in all_labels] + + x = np.arange(len(all_labels)) + width = 0.35 + + fig, ax = plt.subplots(figsize=(10, 6)) + + bars1 = ax.bar( + x - width / 2, + real_counts, + width, + label="Real", + color=sns.color_palette("colorblind")[0], + alpha=0.85, + edgecolor="black", + linewidth=0.5, + ) + bars2 = ax.bar( + x + width / 2, + synth_counts, + width, + label="Synthetic", + color=sns.color_palette("colorblind")[1], + alpha=0.85, + edgecolor="black", + linewidth=0.5, + ) + + ax.set_xlabel("Class Label", fontsize=11, fontweight="bold") + ax.set_ylabel("Count", fontsize=11, fontweight="bold") + ax.set_title( + f"{synth_dataset_name} Absolute Class Counts Comparison", + fontsize=12, + fontweight="bold", + pad=15, + ) + ax.set_xticks(x) + + # Rotate labels if there are many or if they're long + max_label_len = max(len(str(label)) for label in all_labels) + rotation = 45 if len(all_labels) > 8 or max_label_len > 10 else 0 + ha = "right" if rotation > 0 else "center" + + 
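    # Apply the rotation chosen above: when there are many classes or long class names,
    # tick labels are slanted 45 degrees and right-aligned so they do not overlap.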
ax.set_xticklabels(all_labels, rotation=rotation, ha=ha) + ax.legend(frameon=True, loc="upper right", fontsize=10) + ax.grid(axis="y", alpha=0.3, linestyle="--", linewidth=0.5) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white") + print(f"Absolute counts plot saved to {save_path}") + plt.close() + + +def plot_distribution(real_dist, synth_dist, synth_dataset_name, save_path): + """Plot normalized class distribution comparison.""" + all_labels = sorted(set(real_dist.keys()) | set(synth_dist.keys())) + real_probs = [real_dist.get(k, 0) for k in all_labels] + synth_probs = [synth_dist.get(k, 0) for k in all_labels] + + x = np.arange(len(all_labels)) + width = 0.35 + + fig, ax = plt.subplots(figsize=(10, 6)) + + bars1 = ax.bar( + x - width / 2, + real_probs, + width, + label="Real", + color=sns.color_palette("colorblind")[0], + alpha=0.85, + edgecolor="black", + linewidth=0.5, + ) + bars2 = ax.bar( + x + width / 2, + synth_probs, + width, + label="Synthetic", + color=sns.color_palette("colorblind")[1], + alpha=0.85, + edgecolor="black", + linewidth=0.5, + ) + + ax.set_xticks(x) + + # Rotate labels if there are many or if they're long + max_label_len = max(len(str(label)) for label in all_labels) + rotation = 45 if len(all_labels) > 8 or max_label_len > 10 else 0 + ha = "right" if rotation > 0 else "center" + + ax.set_xticklabels(all_labels, rotation=rotation, ha=ha) + ax.legend(frameon=True, loc="upper right", fontsize=10) + ax.grid(axis="y", alpha=0.3, linestyle="--", linewidth=0.5) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + # Format y-axis as percentage + ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.1%}")) + + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white") + print(f"Distribution plot saved to {save_path}") + plt.close() + + +def plot_difference_heatmap(real_dist, synth_dist, synth_dataset_name, save_path): + """Plot heatmap showing per-class differences.""" + all_labels = sorted(set(real_dist.keys()) | set(synth_dist.keys())) + differences = [ + (synth_dist.get(k, 0) - real_dist.get(k, 0)) * 100 for k in all_labels + ] + + fig, ax = plt.subplots(figsize=(10, max(6, len(all_labels) * 0.4))) + + # Create diverging colormap centered at 0 + cmap = sns.diverging_palette(250, 10, as_cmap=True) + + # Create heatmap data + data = np.array(differences).reshape(-1, 1) + + sns.heatmap( + data, + annot=True, + fmt=".2f", + cmap=cmap, + center=0, + yticklabels=all_labels, + xticklabels=["Diff (%)"], + cbar_kws={"label": "Percentage Point Difference"}, + linewidths=0.5, + linecolor="gray", + ax=ax, + ) + + ax.set_title( + f"{synth_dataset_name} Per-Class Distribution Difference (Synthetic - Real)", + fontsize=12, + fontweight="bold", + pad=15, + ) + ax.set_ylabel("Class Label", fontsize=11, fontweight="bold") + + plt.tight_layout() + plt.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white") + print(f"Difference heatmap saved to {save_path}") + plt.close() + + +def save_metrics( + metrics, real_counter, synth_counter, real_dist, synth_dist, save_path +): + """Save all metrics and distributions to JSON.""" + all_labels = sorted(set(real_counter.keys()) | set(synth_counter.keys())) + + output = { + "metrics": metrics, + "class_statistics": { + label: { + "real_count": real_counter.get(label, 0), + "synth_count": synth_counter.get(label, 0), + "real_proportion": 
float(real_dist.get(label, 0)), + "synth_proportion": float(synth_dist.get(label, 0)), + "difference_percentage_points": float( + (synth_dist.get(label, 0) - real_dist.get(label, 0)) * 100 + ), + } + for label in all_labels + }, + "summary": { + "total_real_samples": sum(real_counter.values()), + "total_synth_samples": sum(synth_counter.values()), + "num_classes": len(all_labels), + }, + } + + with open(save_path, "w") as f: + json.dump(output, f, indent=2) + + print(f"Metrics saved to {save_path}") + + +def parse_args(): + parser = argparse.ArgumentParser( + description="CLS GT Comparison", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "synthdataset", + type=str, + help="Name of the synthetic dataset", + ) + + args = parser.parse_args() + return args + + +if __name__ == "__main__": + # Configuration + synth_dataset_name = parse_args().synthdataset + + # Load datasets + base_dataset_name = get_base_dataset_name(synth_dataset_name) + print( + f"Loading datasets: {base_dataset_name} (real) vs {synth_dataset_name} (synthetic)" + ) + + base_dataset = load_dataset(base_dataset_name, is_synthetic=False) + synth_dataset = load_dataset(synth_dataset_name, is_synthetic=True) + + deffile = ENV.SYN_DATA_DEFINITIONS_DIR / f"{synth_dataset_name}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file(deffile) + label_mapping = dsdef.label_mapping + + # Extract labels + real_labels = extract_labels(base_dataset, label_mapping=None) + synth_labels = extract_labels(synth_dataset, label_mapping=label_mapping) + + print(f"\nDataset sizes:") + print(f" Real: {len(real_labels)} samples") + print(f" Synthetic: {len(synth_labels)} samples") + + # Compute distributions + real_counter, real_dist = compute_distribution(real_labels) + synth_counter, synth_dist = compute_distribution(synth_labels) + + # Print distributions + print("\n" + "=" * 80) + print("CLASS DISTRIBUTION COMPARISON") + print("=" * 80) + print( + f"{'Class':<20} {'Real Count':<15} {'Real %':<12} {'Synth Count':<15} {'Synth %':<12} {'Diff %':<10}" + ) + print("-" * 80) + + all_labels = sorted(set(real_counter.keys()) | set(synth_counter.keys())) + for label in all_labels: + real_count = real_counter.get(label, 0) + synth_count = synth_counter.get(label, 0) + real_pct = real_dist.get(label, 0) * 100 + synth_pct = synth_dist.get(label, 0) * 100 + diff_pct = synth_pct - real_pct + + print( + f"{label:<20} {real_count:<15} {real_pct:<12.2f} {synth_count:<15} {synth_pct:<12.2f} {diff_pct:+.2f}" + ) + + # Compare distributions + metrics = compare_distributions(real_dist, synth_dist) + + print("\n" + "=" * 80) + print("DISTRIBUTION SIMILARITY METRICS") + print("=" * 80) + print(f"KL Divergence (Real || Synth): {metrics['kl_divergence']:.4f}") + print( + f"Total Variation Distance: {metrics['total_variation_distance']:.4f}" + ) + print(f"L2 Distance: {metrics['l2_distance']:.4f}") + print("=" * 80) + print("\nInterpretation:") + print(" - Lower values indicate more similar distributions") + print(" - KL divergence: 0 = identical, higher = more different") + print(" - TVD: [0, 1], 0 = identical, 1 = completely different") + + # Create output directory + output_dir = ENV.CLS_GT_ANALYZATION_DIR / synth_dataset_name + output_dir.mkdir(parents=True, exist_ok=True) + + # Generate and save plots + print("\n" + "=" * 80) + print("GENERATING PLOTS") + print("=" * 80) + + # plot_absolute_counts( + # real_counter, + # synth_counter, + # synth_dataset_name=synth_dataset_name, + # save_path=output_dir / 
"absolute_counts.pdf", + # ) + + plot_distribution( + real_dist, + synth_dist, + synth_dataset_name=synth_dataset_name, + save_path=output_dir / f"{synth_dataset_name}_distribution.pdf", + ) + + # plot_difference_heatmap( + # real_dist, + # synth_dist, + # synth_dataset_name=synth_dataset_name, + # save_path=output_dir / "difference_heatmap.pdf", + # ) + + # Save metrics to JSON + save_metrics( + metrics, + real_counter, + synth_counter, + real_dist, + synth_dist, + save_path=output_dir / "metrics.json", + ) + + print("\n" + "=" * 80) + print(f"All outputs saved to: {output_dir}") + print("=" * 80) diff --git a/docgenie/analyzation/gt/dla/dla_gt_analysis.py b/docgenie/analyzation/gt/dla/dla_gt_analysis.py new file mode 100755 index 0000000000000000000000000000000000000000..834110e8b5b12bd5b349244e32c309b0f954b156 --- /dev/null +++ b/docgenie/analyzation/gt/dla/dla_gt_analysis.py @@ -0,0 +1,885 @@ +#!/usr/bin/env python3 +""" +Compare Document Layout Analysis Ground Truth between Synthetic and Real Datasets +For CVPR paper on synthesis of document understanding datasets +""" + +import argparse +import os +from pathlib import Path +from collections import defaultdict +import atria_core +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from typing import Dict, List, Tuple +from dataclasses import dataclass +from scipy.stats import entropy, wasserstein_distance +from scipy.spatial.distance import jensenshannon + +from docgenie import ENV +from docgenie.analyzation.gt.webapp import get_base_dataset_name +from docgenie.data.interfaces.dataset import load_dataset +from docgenie.generation.models._syndatadef import SynDatasetDefinition + +# Set seaborn style for CVPR-quality figures +sns.set_theme(style="whitegrid", context="paper", palette="colorblind") +plt.rcParams["figure.dpi"] = 300 +plt.rcParams["savefig.dpi"] = 300 +plt.rcParams["font.size"] = 9 +plt.rcParams["axes.labelsize"] = 10 +plt.rcParams["axes.titlesize"] = 11 +plt.rcParams["xtick.labelsize"] = 8 +plt.rcParams["ytick.labelsize"] = 8 +plt.rcParams["legend.fontsize"] = 9 +plt.rcParams["figure.titlesize"] = 12 + + +@dataclass +class DatasetMetrics: + """Container for computed dataset metrics""" + + bbox_sizes: Dict[str, List[Tuple[float, float]]] # label -> [(width, height), ...] + bbox_areas: Dict[str, List[float]] # label -> [area, ...] + bbox_aspect_ratios: Dict[str, List[float]] # label -> [aspect_ratio, ...] + centroids: Dict[str, List[Tuple[float, float]]] # label -> [(x, y), ...] + region_counts: Dict[str, List[int]] # label -> [count_per_doc, ...] + page_coverages: List[float] # coverage per document + pairwise_distances: Dict[ + Tuple[str, str], List[float] + ] # (label1, label2) -> [distances, ...] 
+ adjacency_counts: Dict[Tuple[str, str], int] # (label1, label2) -> count + + +def compute_bbox_metrics( + bbox_abs: List[float], + img_width: int, + img_height: int, + bbox_norm: List[float] = None, +) -> Dict: + """Compute metrics for a single bounding box + + Args: + bbox_abs: Bounding box in absolute coordinates [x1, y1, x2, y2] + img_width: Image width + img_height: Image height + bbox_norm: Bounding box in normalized coordinates [x1, y1, x2, y2] (0-1 range) + """ + x1, y1, x2, y2 = bbox_abs + width = x2 - x1 + height = y2 - y1 + area = width * height + aspect_ratio = width / height if height > 0 else 0 + + # Use normalized coords for centroid if provided, otherwise compute from absolute + if bbox_norm is not None: + x1_n, y1_n, x2_n, y2_n = bbox_norm + centroid_x = (x1_n + x2_n) / 2 + centroid_y = (y1_n + y2_n) / 2 + else: + centroid_x = (x1 + x2) / 2 / img_width + centroid_y = (y1 + y2) / 2 / img_height + + return { + "size": (width, height), + "area": area, + "aspect_ratio": aspect_ratio, + "centroid": (centroid_x, centroid_y), + "norm_area": area / (img_width * img_height), + } + + +def compute_pairwise_distances( + bboxes: List[Dict], labels: List[str] +) -> Dict[Tuple[str, str], List[float]]: + """Compute pairwise distances between region centroids""" + distances = defaultdict(list) + + for i in range(len(bboxes)): + for j in range(i + 1, len(bboxes)): + label_pair = tuple(sorted([labels[i], labels[j]])) + c1 = bboxes[i]["centroid"] + c2 = bboxes[j]["centroid"] + dist = np.sqrt((c1[0] - c2[0]) ** 2 + (c1[1] - c2[1]) ** 2) + distances[label_pair].append(dist) + + return distances + + +def compute_adjacency( + bboxes: List[Dict], labels: List[str], threshold: float = 0.1 +) -> Dict[Tuple[str, str], int]: + """Compute adjacency matrix (regions within threshold distance)""" + adjacency = defaultdict(int) + + for i in range(len(bboxes)): + for j in range(i + 1, len(bboxes)): + c1 = bboxes[i]["centroid"] + c2 = bboxes[j]["centroid"] + dist = np.sqrt((c1[0] - c2[0]) ** 2 + (c1[1] - c2[1]) ** 2) + + if dist < threshold: + label_pair = tuple(sorted([labels[i], labels[j]])) + adjacency[label_pair] += 1 + + return adjacency + + +def extract_metrics(dataset, label_mapping: dict[str, str] = None) -> DatasetMetrics: + """Extract all metrics from a dataset""" + bbox_sizes = defaultdict(list) + bbox_areas = defaultdict(list) + bbox_aspect_ratios = defaultdict(list) + centroids = defaultdict(list) + region_counts = defaultdict(list) + page_coverages = [] + all_pairwise_distances = defaultdict(list) + all_adjacency_counts = defaultdict(int) + + # Debug: check first sample + first_sample = True + + for sample in dataset.train: + img_width = sample.image.width + img_height = sample.image.height + + # Get annotations + annotation = sample.annotations[0] # LayoutAnalysisAnnotation + if type(annotation).__name__ == "ClassificationAnnotation": + annotation = sample.annotations[1] # LayoutAnalysisAnnotation + + labels = annotation.annotated_objects.label.name + bboxes_list = annotation.annotated_objects.bbox.value + + # Check if bboxes are normalized + is_normalized = annotation.annotated_objects.bbox.normalized + + if first_sample: + print(f" First sample debug info:") + print(f" Image size: {img_width}x{img_height}") + print(f" Bboxes normalized flag: {is_normalized}") + if len(bboxes_list) > 0: + print(f" First bbox value: {bboxes_list[0]}") + print(f" Number of bboxes: {len(bboxes_list)}") + first_sample = False + + if label_mapping is not None and len(label_mapping) > 0: + labels = 
[label_mapping[l] for l in labels] + + # Count regions per label + label_count = defaultdict(int) + doc_bboxes = [] + doc_labels = [] + total_area = 0 + + for label, bbox_value in zip(labels, bboxes_list): + # Handle normalized vs absolute coordinates + if is_normalized: + # Bboxes are already normalized (0-1), convert to absolute for metrics + x1, y1, x2, y2 = bbox_value + x1_abs = x1 * img_width + y1_abs = y1 * img_height + x2_abs = x2 * img_width + y2_abs = y2 * img_height + bbox_abs = [x1_abs, y1_abs, x2_abs, y2_abs] + bbox_norm = bbox_value # Already normalized for centroid + else: + # Bboxes are in absolute coordinates + bbox_abs = bbox_value + x1, y1, x2, y2 = bbox_value + # Normalize for centroid calculation + bbox_norm = [ + x1 / img_width, + y1 / img_height, + x2 / img_width, + y2 / img_height, + ] + + metrics = compute_bbox_metrics(bbox_abs, img_width, img_height, bbox_norm) + + # Store per-label metrics + bbox_sizes[label].append(metrics["size"]) + bbox_areas[label].append(metrics["area"]) + bbox_aspect_ratios[label].append(metrics["aspect_ratio"]) + centroids[label].append(metrics["centroid"]) + + label_count[label] += 1 + doc_bboxes.append(metrics) + doc_labels.append(label) + total_area += metrics["norm_area"] + + # Region counts per document + for label, count in label_count.items(): + region_counts[label].append(count) + + # Page coverage + page_coverages.append(min(total_area, 1.0)) # Cap at 1.0 + + # Pairwise distances + doc_distances = compute_pairwise_distances(doc_bboxes, doc_labels) + for pair, dists in doc_distances.items(): + all_pairwise_distances[pair].extend(dists) + + # Adjacency + doc_adjacency = compute_adjacency(doc_bboxes, doc_labels) + for pair, count in doc_adjacency.items(): + all_adjacency_counts[pair] += count + + return DatasetMetrics( + bbox_sizes=dict(bbox_sizes), + bbox_areas=dict(bbox_areas), + bbox_aspect_ratios=dict(bbox_aspect_ratios), + centroids=dict(centroids), + region_counts=dict(region_counts), + page_coverages=page_coverages, + pairwise_distances=dict(all_pairwise_distances), + adjacency_counts=dict(all_adjacency_counts), + ) + + +def compute_distribution_metrics( + real_data: np.ndarray, synth_data: np.ndarray, bins: int = 50 +) -> Dict[str, float]: + """Compute various distribution comparison metrics""" + # Create histograms with same bins for both + min_val = min(real_data.min(), synth_data.min()) + max_val = max(real_data.max(), synth_data.max()) + bin_edges = np.linspace(min_val, max_val, bins + 1) + + real_hist, _ = np.histogram(real_data, bins=bin_edges, density=True) + synth_hist, _ = np.histogram(synth_data, bins=bin_edges, density=True) + + # Normalize histograms to sum to 1 for probability distributions + real_hist = real_hist / (real_hist.sum() + 1e-10) + synth_hist = synth_hist / (synth_hist.sum() + 1e-10) + + # Add small epsilon to avoid log(0) + real_hist = real_hist + 1e-10 + synth_hist = synth_hist + 1e-10 + + # KL Divergence (synth || real) + kl_div = entropy(synth_hist, real_hist) + + # Jensen-Shannon Divergence (symmetric) + js_div = jensenshannon(real_hist, synth_hist) ** 2 + + # Wasserstein Distance (Earth Mover's Distance) + w_dist = wasserstein_distance(real_data, synth_data) + + # Mean and std comparison + mean_diff = abs(np.mean(synth_data) - np.mean(real_data)) + std_diff = abs(np.std(synth_data) - np.std(real_data)) + + return { + "kl_divergence": kl_div, + "js_divergence": js_div, + "wasserstein_distance": w_dist, + "mean_difference": mean_diff, + "std_difference": std_diff, + "real_mean": np.mean(real_data), 
+ "synth_mean": np.mean(synth_data), + "real_std": np.std(real_data), + "synth_std": np.std(synth_data), + } + + +# def plot_size_distribution_comparison( +# real_metrics: DatasetMetrics, +# synth_metrics: DatasetMetrics, +# output_dir: Path, +# dataset_name: str, +# ): +# """Plot comprehensive size distribution comparison for all classes""" +# all_labels = sorted( +# set(real_metrics.bbox_areas.keys()) | set(synth_metrics.bbox_areas.keys()) +# ) + +# # Compute metrics for each label +# area_metrics = {} +# width_metrics = {} +# height_metrics = {} +# aspect_ratio_metrics = {} + +# for label in all_labels: +# if label in real_metrics.bbox_areas and label in synth_metrics.bbox_areas: +# # Area metrics +# real_areas = np.array(real_metrics.bbox_areas[label]) +# synth_areas = np.array(synth_metrics.bbox_areas[label]) +# area_metrics[label] = compute_distribution_metrics(real_areas, synth_areas) + +# # Width and height metrics +# real_sizes = np.array(real_metrics.bbox_sizes[label]) +# synth_sizes = np.array(synth_metrics.bbox_sizes[label]) +# width_metrics[label] = compute_distribution_metrics( +# real_sizes[:, 0], synth_sizes[:, 0] +# ) +# height_metrics[label] = compute_distribution_metrics( +# real_sizes[:, 1], synth_sizes[:, 1] +# ) + +# # Aspect ratio metrics +# real_ar = np.array(real_metrics.bbox_aspect_ratios[label]) +# synth_ar = np.array(synth_metrics.bbox_aspect_ratios[label]) +# aspect_ratio_metrics[label] = compute_distribution_metrics( +# real_ar, synth_ar +# ) + +# # Create comprehensive comparison plot +# fig, axes = plt.subplots(2, 2, figsize=(12, 10)) +# fig.suptitle( +# f"{dataset_name}: Size Distribution Comparison", fontsize=14, fontweight="bold" +# ) + +# labels_list = list(area_metrics.keys()) +# x_pos = np.arange(len(labels_list)) + +# # Plot 1: KL Divergence comparison +# kl_areas = [area_metrics[l]["kl_divergence"] for l in labels_list] +# kl_widths = [width_metrics[l]["kl_divergence"] for l in labels_list] +# kl_heights = [height_metrics[l]["kl_divergence"] for l in labels_list] +# kl_ar = [aspect_ratio_metrics[l]["kl_divergence"] for l in labels_list] + +# width = 0.2 +# axes[0, 0].bar(x_pos - 1.5 * width, kl_areas, width, label="Area", alpha=0.8) +# axes[0, 0].bar(x_pos - 0.5 * width, kl_widths, width, label="Width", alpha=0.8) +# axes[0, 0].bar(x_pos + 0.5 * width, kl_heights, width, label="Height", alpha=0.8) +# axes[0, 0].bar(x_pos + 1.5 * width, kl_ar, width, label="Aspect Ratio", alpha=0.8) +# axes[0, 0].set_ylabel("KL Divergence") +# axes[0, 0].set_title("KL Divergence (Synth || Real)", fontweight="bold") +# axes[0, 0].set_xticks(x_pos) +# axes[0, 0].set_xticklabels([l[:20] for l in labels_list], rotation=45, ha="right") +# axes[0, 0].legend() +# axes[0, 0].grid(axis="y", alpha=0.3) + +# # Plot 2: Jensen-Shannon Divergence +# js_areas = [area_metrics[l]["js_divergence"] for l in labels_list] +# js_widths = [width_metrics[l]["js_divergence"] for l in labels_list] +# js_heights = [height_metrics[l]["js_divergence"] for l in labels_list] +# js_ar = [aspect_ratio_metrics[l]["js_divergence"] for l in labels_list] + +# axes[0, 1].bar(x_pos - 1.5 * width, js_areas, width, label="Area", alpha=0.8) +# axes[0, 1].bar(x_pos - 0.5 * width, js_widths, width, label="Width", alpha=0.8) +# axes[0, 1].bar(x_pos + 0.5 * width, js_heights, width, label="Height", alpha=0.8) +# axes[0, 1].bar(x_pos + 1.5 * width, js_ar, width, label="Aspect Ratio", alpha=0.8) +# axes[0, 1].set_ylabel("JS Divergence") +# axes[0, 1].set_title("Jensen-Shannon Divergence", fontweight="bold") +# 
axes[0, 1].set_xticks(x_pos) +# axes[0, 1].set_xticklabels([l[:20] for l in labels_list], rotation=45, ha="right") +# axes[0, 1].legend() +# axes[0, 1].grid(axis="y", alpha=0.3) + +# # Plot 3: Wasserstein Distance +# w_areas = [area_metrics[l]["wasserstein_distance"] for l in labels_list] +# w_widths = [width_metrics[l]["wasserstein_distance"] for l in labels_list] +# w_heights = [height_metrics[l]["wasserstein_distance"] for l in labels_list] + +# axes[1, 0].bar(x_pos - width, w_areas, width, label="Area", alpha=0.8) +# axes[1, 0].bar(x_pos, w_widths, width, label="Width", alpha=0.8) +# axes[1, 0].bar(x_pos + width, w_heights, width, label="Height", alpha=0.8) +# axes[1, 0].set_ylabel("Wasserstein Distance") +# axes[1, 0].set_title("Wasserstein Distance (Earth Mover)", fontweight="bold") +# axes[1, 0].set_xticks(x_pos) +# axes[1, 0].set_xticklabels([l[:20] for l in labels_list], rotation=45, ha="right") +# axes[1, 0].legend() +# axes[1, 0].grid(axis="y", alpha=0.3) + +# # Plot 4: Mean and Std differences for area +# mean_diffs = [area_metrics[l]["mean_difference"] for l in labels_list] +# std_diffs = [area_metrics[l]["std_difference"] for l in labels_list] + +# ax4_twin = axes[1, 1].twinx() +# p1 = axes[1, 1].bar( +# x_pos - width / 2, mean_diffs, width, label="Mean Diff", alpha=0.8, color="C0" +# ) +# p2 = ax4_twin.bar( +# x_pos + width / 2, std_diffs, width, label="Std Diff", alpha=0.8, color="C1" +# ) +# axes[1, 1].set_ylabel("Mean Difference (pixels²)", color="C0") +# ax4_twin.set_ylabel("Std Difference (pixels²)", color="C1") +# axes[1, 1].set_title("Area: Mean & Std Differences", fontweight="bold") +# axes[1, 1].set_xticks(x_pos) +# axes[1, 1].set_xticklabels([l[:20] for l in labels_list], rotation=45, ha="right") +# axes[1, 1].tick_params(axis="y", labelcolor="C0") +# ax4_twin.tick_params(axis="y", labelcolor="C1") +# axes[1, 1].grid(axis="y", alpha=0.3) + +# # Add combined legend for plot 4 +# lines = [p1, p2] +# labels = ["Mean Diff", "Std Diff"] +# axes[1, 1].legend(lines, labels, loc="upper left") + +# plt.tight_layout() +# plt.savefig( +# output_dir / "size_distribution_comparison.png", dpi=300, bbox_inches="tight" +# ) +# plt.close() + +# # Create detailed overlay histograms for each class +# n_labels = len(labels_list) +# print(labels_list) +# n_cols = 3 +# n_rows = (n_labels + n_cols - 1) // n_cols + +# fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows)) +# fig.suptitle( +# f"{dataset_name}: Area Distribution Overlays", fontsize=14, fontweight="bold" +# ) +# axes = axes.flatten() if n_labels > 1 else [axes] + +# for idx, label in enumerate(labels_list): +# if label in real_metrics.bbox_areas and label in synth_metrics.bbox_areas: +# real_areas = np.array(real_metrics.bbox_areas[label]) +# synth_areas = np.array(synth_metrics.bbox_areas[label]) + +# axes[idx].hist( +# real_areas, +# bins=40, +# alpha=0.5, +# label="Real", +# density=True, +# color="C0", +# edgecolor="black", +# linewidth=0.5, +# ) +# axes[idx].hist( +# synth_areas, +# bins=40, +# alpha=0.5, +# label="Synth", +# density=True, +# color="C1", +# edgecolor="black", +# linewidth=0.5, +# ) + +# axes[idx].set_xlabel("Area (pixels²)") +# axes[idx].set_ylabel("Density") +# axes[idx].set_title(f"{label}", fontweight="bold") +# axes[idx].legend() +# axes[idx].grid(axis="y", alpha=0.3) + +# # Add metrics text +# metrics_text = ( +# f"KL: {area_metrics[label]['kl_divergence']:.3f}\n" +# f"JS: {area_metrics[label]['js_divergence']:.3f}" +# ) +# axes[idx].text( +# 0.98, +# 0.98, +# metrics_text, +# 
transform=axes[idx].transAxes, +# verticalalignment="top", +# horizontalalignment="right", +# bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5), +# fontsize=8, +# ) + +# # Hide unused subplots +# for idx in range(n_labels, len(axes)): +# axes[idx].axis("off") + +# plt.tight_layout() +# plt.savefig( +# output_dir / "area_distributions_overlay.png", dpi=300, bbox_inches="tight" +# ) +# plt.close() + + +def plot_spatial_heatmaps( + real_metrics: DatasetMetrics, + synth_metrics: DatasetMetrics, + output_dir: Path, + dataset_name: str, +): + """Plot 2D heatmaps showing complete region coverage (location and size)""" + all_labels = sorted( + set(real_metrics.centroids.keys()) | set(synth_metrics.centroids.keys()) + ) + + n_classes = len(all_labels) + n_cols = min(3, n_classes) + n_rows = (n_classes + n_cols - 1) // n_cols + + fig, axes = plt.subplots(n_rows, n_cols * 2, figsize=(6 * n_cols, 4 * n_rows)) + + # Handle different subplot array shapes based on ACTUAL subplot dimensions + # We always have n_cols * 2 columns (real + synth for each class) + if n_rows == 1 and n_cols == 1: + # Single class: 1 row, 2 columns → axes is 1D array of length 2 + axes = axes.reshape(1, -1) + elif n_rows == 1: + # Multiple classes, single row → axes is 1D array + axes = axes.reshape(1, -1) + elif n_cols == 1: + # Multiple rows, single column of classes → axes is 2D + # Already correct shape: (n_rows, 2) + pass + # else: axes is already 2D with correct shape + + for idx, label in enumerate(all_labels): + row = idx // n_cols + col_base = (idx % n_cols) * 2 + + # Real data coverage map + if label in real_metrics.bbox_sizes and len(real_metrics.bbox_sizes[label]) > 0: + bbox_sizes = real_metrics.bbox_sizes[label] + bbox_areas = real_metrics.bbox_areas[label] + centroids = real_metrics.centroids[label] + + # Create high-resolution coverage grid + grid_size = 200 + coverage = np.zeros((grid_size, grid_size)) + + # For each region, fill in the coverage area based on size + for (width, height), area, (cx, cy) in zip( + bbox_sizes, bbox_areas, centroids + ): + typical_img_size = 2000.0 + width_norm = min(width / typical_img_size, 0.5) + height_norm = min(height / typical_img_size, 0.5) + + x_start = np.clip(cx - width_norm / 2, 0, 1) + x_end = np.clip(cx + width_norm / 2, 0, 1) + y_start = np.clip(cy - height_norm / 2, 0, 1) + y_end = np.clip(cy + height_norm / 2, 0, 1) + + x_start_idx = int(x_start * grid_size) + x_end_idx = min(int(x_end * grid_size) + 1, grid_size) + y_start_idx = int(y_start * grid_size) + y_end_idx = min(int(y_end * grid_size) + 1, grid_size) + + coverage[y_start_idx:y_end_idx, x_start_idx:x_end_idx] += 1 + + print(f"\n {label} - Real coverage:") + print(f" Total regions: {len(bbox_sizes)}") + print(f" Coverage area: {np.sum(coverage > 0) / (grid_size**2):.2%}") + print(f" Max overlap: {coverage.max():.0f} regions") + + im1 = axes[row, col_base].imshow( + coverage, + origin="upper", + cmap="YlOrRd", + extent=[0, 1, 0, 1], + aspect="auto", + interpolation="bilinear", + ) + axes[row, col_base].set_title( + f"{label}\n(Real, n={len(bbox_sizes)})", fontweight="bold", fontsize=9 + ) + axes[row, col_base].set_xlabel("Normalized X", fontsize=8) + axes[row, col_base].set_ylabel("Normalized Y", fontsize=8) + axes[row, col_base].grid(True, alpha=0.3, linewidth=0.5) + plt.colorbar(im1, ax=axes[row, col_base], label="Overlap") + else: + axes[row, col_base].text( + 0.5, 0.5, "No data", ha="center", va="center", fontsize=12 + ) + axes[row, col_base].set_title( + f"{label}\n(Real, n=0)", fontweight="bold", 
fontsize=9 + ) + axes[row, col_base].set_xlim(0, 1) + axes[row, col_base].set_ylim(0, 1) + axes[row, col_base].set_xlabel("Normalized X", fontsize=8) + axes[row, col_base].set_ylabel("Normalized Y", fontsize=8) + + # Synthetic data coverage map + if ( + label in synth_metrics.bbox_sizes + and len(synth_metrics.bbox_sizes[label]) > 0 + ): + bbox_sizes = synth_metrics.bbox_sizes[label] + bbox_areas = synth_metrics.bbox_areas[label] + centroids = synth_metrics.centroids[label] + + grid_size = 200 + coverage = np.zeros((grid_size, grid_size)) + + for (width, height), area, (cx, cy) in zip( + bbox_sizes, bbox_areas, centroids + ): + typical_img_size = 2000.0 + width_norm = min(width / typical_img_size, 0.5) + height_norm = min(height / typical_img_size, 0.5) + + x_start = np.clip(cx - width_norm / 2, 0, 1) + x_end = np.clip(cx + width_norm / 2, 0, 1) + y_start = np.clip(cy - height_norm / 2, 0, 1) + y_end = np.clip(cy + height_norm / 2, 0, 1) + + x_start_idx = int(x_start * grid_size) + x_end_idx = min(int(x_end * grid_size) + 1, grid_size) + y_start_idx = int(y_start * grid_size) + y_end_idx = min(int(y_end * grid_size) + 1, grid_size) + + coverage[y_start_idx:y_end_idx, x_start_idx:x_end_idx] += 1 + + print(f"\n {label} - Synth coverage:") + print(f" Total regions: {len(bbox_sizes)}") + print(f" Coverage area: {np.sum(coverage > 0) / (grid_size**2):.2%}") + print(f" Max overlap: {coverage.max():.0f} regions") + + im2 = axes[row, col_base + 1].imshow( + coverage, + origin="upper", + cmap="YlOrRd", + extent=[0, 1, 0, 1], + aspect="auto", + interpolation="bilinear", + ) + axes[row, col_base + 1].set_title( + f"{label}\n(Synth, n={len(bbox_sizes)})", fontweight="bold", fontsize=9 + ) + axes[row, col_base + 1].set_xlabel("Normalized X", fontsize=8) + axes[row, col_base + 1].set_ylabel("Normalized Y", fontsize=8) + axes[row, col_base + 1].grid(True, alpha=0.3, linewidth=0.5) + plt.colorbar(im2, ax=axes[row, col_base + 1], label="Overlap") + else: + axes[row, col_base + 1].text( + 0.5, 0.5, "No data", ha="center", va="center", fontsize=12 + ) + axes[row, col_base + 1].set_title( + f"{label}\n(Synth, n=0)", fontweight="bold", fontsize=9 + ) + axes[row, col_base + 1].set_xlim(0, 1) + axes[row, col_base + 1].set_ylim(0, 1) + axes[row, col_base + 1].set_xlabel("Normalized X", fontsize=8) + axes[row, col_base + 1].set_ylabel("Normalized Y", fontsize=8) + + # Hide empty subplots + for idx in range(n_classes, n_rows * n_cols): + row = idx // n_cols + col_base = (idx % n_cols) * 2 + axes[row, col_base].axis("off") + axes[row, col_base + 1].axis("off") + + plt.tight_layout() + plt.savefig( + output_dir / f"{dataset_name}_spatial_heatmaps_grid.pdf", + dpi=300, + bbox_inches="tight", + ) + plt.close() + + +def plot_region_counts( + real_metrics: DatasetMetrics, + synth_metrics: DatasetMetrics, + output_dir: Path, + dataset_name: str, +): + """Plot distribution of region counts per document in a grid layout""" + all_labels = sorted( + set(real_metrics.region_counts.keys()) | set(synth_metrics.region_counts.keys()) + ) + + n_classes = len(all_labels) + n_cols = min(4, n_classes) + n_rows = (n_classes + n_cols - 1) // n_cols + + fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows)) + + # Handle different subplot array shapes + if n_classes == 1: + axes = np.array([axes]) + else: + axes = axes.flatten() + + for idx, label in enumerate(all_labels): + ax = axes[idx] + + # Get data + real_counts = real_metrics.region_counts.get(label, []) + synth_counts = synth_metrics.region_counts.get(label, 
[]) + + if len(real_counts) > 0 or len(synth_counts) > 0: + # Determine data range + all_counts = real_counts + synth_counts + min_count = min(all_counts) if all_counts else 0 + max_count = max(all_counts) if all_counts else 1 + + # Create bins with equal width + # Use integer bins for count data (each count gets its own bin) + bin_width = 1 + bins = np.arange(min_count - 0.5, max_count + 1.5, bin_width) + + # If there are too many bins, increase bin width + if len(bins) > 30: + bin_width = max(1, int(np.ceil((max_count - min_count) / 30))) + bins = np.arange(min_count - 0.5, max_count + 1.5, bin_width) + + # Plot histograms with explicit bin edges for consistent bar width + if len(real_counts) > 0: + ax.hist( + real_counts, + bins=bins, + alpha=0.6, + label="Real", + density=True, + color="C0", + edgecolor="black", + linewidth=0.5, + ) + if len(synth_counts) > 0: + ax.hist( + synth_counts, + bins=bins, + alpha=0.6, + label="Synth", + density=True, + color="C1", + edgecolor="black", + linewidth=0.5, + ) + + ax.set_xlabel("Count per Document", fontsize=9) + ax.set_ylabel("Density", fontsize=9) + ax.set_title(f"{label}", fontweight="bold", fontsize=10) + ax.legend(fontsize=8) + ax.grid(axis="y", alpha=0.3) + ax.tick_params(labelsize=8) + + # Add statistics as text + if len(real_counts) > 0 and len(synth_counts) > 0: + stats_text = ( + f"Real: μ={np.mean(real_counts):.1f}\n" + f"Synth: μ={np.mean(synth_counts):.1f}" + ) + ax.text( + 0.98, + 0.98, + stats_text, + transform=ax.transAxes, + verticalalignment="top", + horizontalalignment="right", + bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5), + fontsize=7, + ) + else: + ax.text( + 0.5, + 0.5, + "No data", + ha="center", + va="center", + transform=ax.transAxes, + fontsize=10, + ) + ax.set_title(f"{label}", fontweight="bold", fontsize=10) + + # Hide unused subplots + for idx in range(n_classes, len(axes)): + axes[idx].axis("off") + + plt.tight_layout() + plt.savefig( + output_dir / f"{dataset_name}_region_counts.pdf", dpi=300, bbox_inches="tight" + ) + plt.close() + + +def save_summary_stats( + real_metrics: DatasetMetrics, + synth_metrics: DatasetMetrics, + output_dir: Path, + dataset_name: str, +): + """Save summary statistics to text file""" + with open(output_dir / "summary_stats.txt", "w") as f: + f.write("=" * 80 + "\n") + f.write(f"DATASET COMPARISON SUMMARY: {dataset_name}\n") + f.write("=" * 80 + "\n\n") + + # Page coverage + f.write("PAGE COVERAGE:\n") + f.write( + f" Real - Mean: {np.mean(real_metrics.page_coverages):.3f}, " + f"Std: {np.std(real_metrics.page_coverages):.3f}, " + f"Median: {np.median(real_metrics.page_coverages):.3f}\n" + ) + f.write( + f" Synth - Mean: {np.mean(synth_metrics.page_coverages):.3f}, " + f"Std: {np.std(synth_metrics.page_coverages):.3f}, " + f"Median: {np.median(synth_metrics.page_coverages):.3f}\n\n" + ) + + # Region counts + f.write("AVERAGE REGION COUNTS PER DOCUMENT:\n") + all_labels = sorted( + set(real_metrics.region_counts.keys()) + | set(synth_metrics.region_counts.keys()) + ) + for label in all_labels: + real_mean = np.mean(real_metrics.region_counts.get(label, [0])) + synth_mean = np.mean(synth_metrics.region_counts.get(label, [0])) + real_std = np.std(real_metrics.region_counts.get(label, [0])) + synth_std = np.std(synth_metrics.region_counts.get(label, [0])) + f.write( + f" {label:30s} - Real: {real_mean:6.2f}±{real_std:5.2f}, " + f"Synth: {synth_mean:6.2f}±{synth_std:5.2f}\n" + ) + + f.write("\n" + "=" * 80 + "\n") + + +def parse_args(): + parser = argparse.ArgumentParser( + 
description="DLA GT Comparison", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "synthdataset", + type=str, + help="Name of the synthetic dataset", + ) + + args = parser.parse_args() + return args + + +def main(): + """Main comparison workflow""" + args = parse_args() + synth_dataset_name = args.synthdataset + + # Setup output directory + output_dir = ENV.DLA_GT_ANALYZATION_DIR / synth_dataset_name + output_dir.mkdir(parents=True, exist_ok=True) + + print("Loading datasets...") + base_dataset_name = get_base_dataset_name(synth_dataset_name) + base_dataset = load_dataset(base_dataset_name, is_synthetic=False) + print(base_dataset.metadata.dataset_labels) + synth_dataset = load_dataset(synth_dataset_name, is_synthetic=True) + + deffile = ENV.SYN_DATA_DEFINITIONS_DIR / f"{synth_dataset_name}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file(deffile) + label_mapping = dsdef.label_mapping + + print("Extracting metrics from real dataset...") + real_metrics = extract_metrics(base_dataset, label_mapping=None) + + print("Extracting metrics from synthetic dataset...") + synth_metrics = extract_metrics(synth_dataset, label_mapping=label_mapping) + + print("Generating visualizations...") + + # print(" - Size distribution comparison...") + # plot_size_distribution_comparison( + # real_metrics, synth_metrics, output_dir, synth_dataset_name + # ) + + print(" - Spatial heatmaps...") + plot_spatial_heatmaps(real_metrics, synth_metrics, output_dir, synth_dataset_name) + + print(" - Region count distributions...") + plot_region_counts(real_metrics, synth_metrics, output_dir, synth_dataset_name) + + print("Saving summary statistics...") + save_summary_stats(real_metrics, synth_metrics, output_dir, synth_dataset_name) + + print(f"\nAnalysis complete! Results saved to: {output_dir}") + + +if __name__ == "__main__": + main() diff --git a/docgenie/analyzation/gt/embeddings_qa.py b/docgenie/analyzation/gt/embeddings_qa.py new file mode 100755 index 0000000000000000000000000000000000000000..59287f63adb067e7984566dcd1f453bd5359990f --- /dev/null +++ b/docgenie/analyzation/gt/embeddings_qa.py @@ -0,0 +1,263 @@ +""" +TODO: include answers in QA GT embeddings? 
+""" + +from __future__ import annotations +import h5py +import argparse +from pathlib import Path +from typing import TYPE_CHECKING, Callable, TypeVar +import numpy as np +import tqdm +from docgenie import ENV +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data._core._data_types import DocumentInstanceModelInput +from docgenie.data.interfaces.dataset import load_dataset +from docgenie.logging import get_logger +from atria_core.types.data_instance.base import ( + BaseDataInstance, +) +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data.interfaces.data_pipeline import ( + load_preprocessed_data_pipeline, +) +from typing import Literal +from docgenie.data._core._utilities import TaskType +from docgenie.data.interface import load_transform +from docgenie.generation.models import ( + SyntheticDatasetFileStructure, + SynDatasetDefinition, +) + +import numpy as np +from torch.utils.data import DataLoader + +T_BaseDataInstance = TypeVar("T_BaseDataInstance", bound=BaseDataInstance) + + +logger = get_logger(__name__) + + +def _iterate_dataset( + model_fn: Callable, + embedding_fn: Callable, + dataloader: "DataLoader", + device: str = "cuda", +): + """Inner function that actually generates the embeddings.""" + import torch + + model = model_fn() + model.to(device) + model.eval() + print("Model is on:", next(model.parameters()).device) + + sample_ids = [] + doc_ids = [] + questions = [] + answers = [] + embeddings = [] + with torch.no_grad(): + for batch in tqdm.tqdm(dataloader, desc="Extracting embeddings"): + embeddings.append(embedding_fn(model, batch)) + sample_ids.extend(batch["sample_ids"]) + doc_ids.extend(batch["doc_ids"]) + questions.extend(batch["questions"]) + answers.extend(batch["answers"]) + + embeddings = torch.cat(embeddings, dim=0) + return embeddings.cpu().numpy(), questions, answers, sample_ids, doc_ids + + +def _extract_text_embeddings(dataloader: "DataLoader", device: str = "cuda"): + """Inner function that actually generates the embeddings.""" + + def model_fn(): + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-mpnet-base-v2") + model.to(device) + model.eval() + return model + + def embedding_fn(model, inputs): + sentences = [qa_question for qa_question in inputs["questions"]] + return model.encode(sentences, convert_to_tensor=True) + + question_embeddings, questions, answers, sample_ids, doc_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=embedding_fn, + dataloader=dataloader, + device=device, + ) + + def qa_embedding_fn(model, inputs): + sentences = [ + f"Question: {q} Answer: {a}" + for q, a in zip(inputs["questions"], inputs["answers"]) + ] + return model.encode(sentences, convert_to_tensor=True) + + qa_embeddings, questions, answers, sample_ids, doc_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=qa_embedding_fn, + dataloader=dataloader, + device=device, + ) + + return question_embeddings, qa_embeddings, questions, answers, sample_ids, doc_ids + + +def extract_embeddings( # MsgpackDatasetReader[T_BaseDataInstance] | None + dataloader: "DataLoader", + output_dir: Path, + device: str = "cuda", +): + q_path = output_dir / f"Q.h5" + qa_path = output_dir / f"QA.h5" + + if q_path.exists() and qa_path.exists(): + print(f"Found existing QA embeddings at {q_path} and {qa_path} - SKIPPING") + else: + extraction_func = _extract_text_embeddings + question_embeddings, qa_embeddings, questions, answers, sample_ids, doc_ids = ( + 
extraction_func(dataloader, device) + ) + + _save_embeddings( + embeddings=question_embeddings, + questions=questions, + answers=answers, + sample_ids=sample_ids, + document_ids=doc_ids, + file_path=q_path, + ) + + _save_embeddings( + embeddings=qa_embeddings, + questions=questions, + answers=answers, + sample_ids=sample_ids, + document_ids=doc_ids, + file_path=Path(output_dir) / f"QA.h5", + ) + + +def _save_embeddings( + embeddings: "np.ndarray", + questions: list[str], + answers: list[str], + sample_ids: list[str], + document_ids: list[str], + file_path: Path, +): + file_path.parent.mkdir(parents=True, exist_ok=True) + with h5py.File(file_path, "w") as f: + f.create_dataset("embeddings", data=embeddings) + f.create_dataset("questions", data=questions) + f.create_dataset("answers", data=answers) + f.create_dataset("sample_ids", data=sample_ids) + f.create_dataset("document_ids", data=document_ids) + + +def load_qa_embeddings(dataset_name: str, embedding_type: Literal["Q", "QA"]): + file_path: Path = ENV.GT_EMBEDDINGS_DIR / dataset_name / f"{embedding_type}.h5" + print(f"Loading embeddings from {file_path}") + + def decode_str_collection(col): + return [s.decode("utf-8") if isinstance(s, bytes) else s for s in col] + + with h5py.File(file_path, "r") as f: + embeddings = f["embeddings"][:] + questions = f["questions"][:] + answers = f["answers"][:] + sample_ids = f["sample_ids"][:] + doc_ids = f["document_ids"][:] + + return ( + embeddings, + decode_str_collection(questions), + decode_str_collection(answers), + decode_str_collection(sample_ids), + decode_str_collection(doc_ids), + ) + + +def collate_fn_extract_questions(batch): + all_questions = [] + all_answers = [] + all_doc_ids = [] + all_sample_ids = [] + + for doc in batch: + for a in doc.annotations: + for qa in a.qa_pairs: + all_questions.append(qa.question_text) + all_answers.append(qa.answer_text[0]) + all_doc_ids.append(doc.sample_id) + all_sample_ids.append(f"{doc.sample_id}_{qa.id}") + + return { + "questions": all_questions, + "answers": all_answers, + "sample_ids": all_sample_ids, + "doc_ids": all_doc_ids, + } + + +def main(dataset_name: str, is_synth: bool): + if is_synth: + ymal_file = ENV.SYN_DATA_DEFINITIONS_DIR / f"{dataset_name}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file( + yaml_path=ymal_file + ) + # prepare_synthetic_dataset(dsdef=dsdef) + + # data_pipeline = load_preprocessed_data_pipeline( + # dataset_name=dataset_name, + # # task_type=TaskType.generate_embeddings, + # is_synthetic=is_synth, + # ) + # train_dataloader = data_pipeline.train_dataloader(batch_size=512, num_workers=2) + + dataset = load_dataset(dataset_name=dataset_name, is_synthetic=is_synth) + train_dataloader = DataLoader( + dataset=dataset.train, + batch_size=512, + num_workers=0, + collate_fn=collate_fn_extract_questions, + ) + + output_dir = ENV.GT_EMBEDDINGS_DIR / dataset_name + extract_embeddings( + dataloader=train_dataloader, + output_dir=output_dir, + ) + + # embeddings, questions, answers, sample_ids, doc_ids = load_qa_embeddings( + # dataset_name=dataset_name, embedding_type="Q" + # ) + # for e, q, a, s, d in zip(embeddings, questions, answers, sample_ids, doc_ids): + # print(e, q, a, s, d) + # input() + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate GT embeddings") + parser.add_argument( + "dataset", + type=str, + help="Name of the dataset (e.g., docvqa, mysynthetic, pubtabnet)", + ) + + parser.add_argument( + "--is_synth", + action="store_true", + help="If set, determines that the 
dataset is a synthetic dataset", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(dataset_name=args.dataset, is_synth=args.is_synth) diff --git a/docgenie/analyzation/gt/embeddings_qa_using_datapipeline.py b/docgenie/analyzation/gt/embeddings_qa_using_datapipeline.py new file mode 100755 index 0000000000000000000000000000000000000000..2cef91050ee115c0df3dfd26879a9a5fef9177ac --- /dev/null +++ b/docgenie/analyzation/gt/embeddings_qa_using_datapipeline.py @@ -0,0 +1,291 @@ +""" +TODO: include answers in QA GT embeddings? +""" + +from __future__ import annotations +import h5py +import argparse +from pathlib import Path +from typing import TYPE_CHECKING, Callable, TypeVar +import numpy as np +import tqdm +from docgenie import ENV +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data._core._data_types import DocumentInstanceModelInput +from docgenie.logging import get_logger +from atria_core.types.data_instance.base import ( + BaseDataInstance, +) +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data.interfaces.synthetic_data import ( + prepare_synthetic_dataset, +) +from docgenie.data.interfaces.data_pipeline import ( + load_preprocessed_data_pipeline, +) +from typing import Literal +from docgenie.data._core._utilities import TaskType +from docgenie.data.interface import load_transform +from docgenie.generation.models import ( + SyntheticDatasetFileStructure, + SynDatasetDefinition, +) +from docgenie.data._core._dataset import Dataset +from docgenie.data._core._msgpack_dataset_reader import MsgpackDatasetReader + +T_BaseDataInstance = TypeVar("T_BaseDataInstance", bound=BaseDataInstance) +if TYPE_CHECKING: + import numpy as np + from torch.utils.data import DataLoader + +logger = get_logger(__name__) + + +def _iterate_dataset( + model_fn: Callable, + embedding_fn: Callable, + dataloader: "DataLoader", + device: str = "cuda", +): + """Inner function that actually generates the embeddings.""" + import torch + + model = model_fn() + model.to(device) + model.eval() + print("Model is on:", next(model.parameters()).device) + + sample_ids = [] + embeddings = [] + with torch.no_grad(): + for batch in tqdm.tqdm(dataloader, desc="Extracting embeddings"): + batch_dict = batch.to_dict() + batch: DocumentInstanceModelInput + batch = batch.select_first_overflow_samples() + batch = batch.to(device) + + token_bboxes = batch.token_bboxes + if token_bboxes is not None: + if token_bboxes.min() >= 0 and token_bboxes.max() <= 1.0: + # if bboxes are normalized to [0, 1], convert to [0, 1000] as expected by layoutlmv3 + token_bboxes = (token_bboxes * 1000).long() + else: + logger.warning( + f"Token bboxes must be in the range [0, 1], but got min {token_bboxes.min()} and max {token_bboxes.max()}" + ) + token_bboxes = (token_bboxes.clip(0, 1.0) * 1000).long() + + # assert check + assert token_bboxes.min() >= 0 and token_bboxes.max() <= 1000, ( + f"Token bboxes must be in the range [0, 1000], but got min {token_bboxes.min()} and max {token_bboxes.max()}" + ) + + # make sure if image is normlized 0-1 as in layoutlm we renormalize using clip stats + assert batch.image.min() >= -1.1 and batch.image.max() <= 1.1, ( + f"Image pixel values must be in the range [0, 1], but got min {batch.image.min()} and max {batch.image.max()}" + ) + + # make inputs + inputs = dict( + qa_answers=batch.qa_answers, + qa_question=batch.qa_question, + sample_ids=batch.sample_id, + ) + + 
embeddings.append(embedding_fn(model, inputs)) + + # in our preprocessed dataset indices are always unqiue + # but sample_ids may not be always unique in some rare cases + sample_ids.extend(batch.sample_id) + + embeddings = torch.cat(embeddings, dim=0) + return embeddings.cpu().numpy(), sample_ids + + +def _extract_text_embeddings( + dataloader: "DataLoader", + device: str = "cuda", +): + """Inner function that actually generates the embeddings.""" + + def model_fn(): + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer("all-mpnet-base-v2") + model.to(device) + model.eval() + return model + + print("Extracting embeddings only for Questions.................") + + def embedding_fn(model, inputs): + sentences = [qa_question for qa_question in inputs["qa_question"]] + return model.encode(sentences, convert_to_tensor=True) + + question_embeddings, question_sample_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=embedding_fn, + dataloader=dataloader, + device=device, + ) + + print("Extracting embeddings for both Questions and Answers...............") + + def qa_embedding_fn(model, inputs): + """I asked gpt and It said this type of approach is common in SBERT/Text-encoders""" + sentences = [ + f"Question: {q} Answer: {a}" + for q, a in zip(inputs["qa_question"], inputs["qa_answers"]) + ] + return model.encode(sentences, convert_to_tensor=True) + + qa_embeddings, qa_sample_ids = _iterate_dataset( + model_fn=model_fn, + embedding_fn=qa_embedding_fn, + dataloader=dataloader, + device=device, + ) + + return dict( + question_embeddings=question_embeddings, + question_sample_ids=question_sample_ids, + qa_embeddings=qa_embeddings, + qa_sample_ids=qa_sample_ids, + ) + + +def embedding_extraction_with_cache( # MsgpackDatasetReader[T_BaseDataInstance] | None + dataloader: "DataLoader", + output_dir: str | Path, + embedding_type: EmbeddingType, + device: str = "cuda", + cache_outputs: bool = True, + load_embeddings: Literal[ + "question_only", "QA" + ] = "question_only", # used to load embeddings from chache +): + """By default it returns question only embeddings""" + """Generic cacher function that handles caching logic for any embedding type.""" + if load_embeddings == "QA": + cache_file = Path(output_dir) / f"QA_{embedding_type.value}.h5" + elif load_embeddings == "question_only": + cache_file = Path(output_dir) / f"Q_{embedding_type.value}.h5" + + if cache_outputs and cache_file.exists(): + logger.info( + f"Loading cached {load_embeddings}_{embedding_type.value} embeddings from {cache_file}" + ) + return _load_embeddings(cache_file) + + extraction_func = _extract_text_embeddings + all_embeddings = extraction_func(dataloader, device) + + # Question only embeddings + question_embeddings = all_embeddings["question_embeddings"] + question_sample_ids = all_embeddings["question_sample_ids"] + + # Question + Answer embeddings + qa_embeddings = all_embeddings["qa_embeddings"] + qa_sample_ids = all_embeddings["qa_sample_ids"] + + if cache_outputs: + """Checking that embeddings and sample_ids have same length""" + assert len(question_sample_ids) == question_embeddings.shape[0], logger.warning( + f"[Error in Questuion only Embedding] Number of sample IDs ({len(question_sample_ids)}) must match number of embeddings ({question_embeddings.shape[0]})" + ) + + assert len(qa_sample_ids) == qa_embeddings.shape[0], logger.warning( + f"[Error in QA Embedding] Number of sample IDs ({len(qa_sample_ids)}) must match number of embeddings ({qa_embeddings.shape[0]})" + ) + 
"""Checking that sample_ids are unique""" + assert len(set(question_sample_ids)) == len(question_sample_ids), ( + logger.warning( + "[ERROR in Question only Embedding] Sample IDs must be unique" + ) + ) + assert len(set(qa_sample_ids)) == len(qa_sample_ids), logger.warning( + "[ERROR in QA Embedding] Sample IDs must be unique" + ) + """Saving question only embeddings""" + _save_embeddings( + embeddings=question_embeddings, + sample_ids=question_sample_ids, + file_path=Path(output_dir) / f"Q_{embedding_type.value}.h5", + ) + """Saving QA only embeddings""" + _save_embeddings( + embeddings=qa_embeddings, + sample_ids=qa_sample_ids, + file_path=Path(output_dir) / f"QA_{embedding_type.value}.h5", + ) + return _load_embeddings(cache_file) + + return question_embeddings, question_sample_ids + + +def _save_embeddings(embeddings: "np.ndarray", sample_ids: list[str], file_path: Path): + import h5py + + file_path.parent.mkdir(parents=True, exist_ok=True) + with h5py.File(file_path, "w") as f: + f.create_dataset("embeddings", data=embeddings) + f.create_dataset("sample_ids", data=sample_ids) + + +def _load_embeddings(file_path: Path): + import h5py + + print(f"Loading embeddings from {file_path}") + + with h5py.File(file_path, "r") as f: + sample_ids = f["sample_ids"][:] + embeddings = f["embeddings"][:] + return embeddings, [ + s.decode("utf-8") if isinstance(s, bytes) else s for s in sample_ids + ] + + +def main(dataset_name: str, is_synth: bool): + if is_synth: + ymal_file = ENV.SYN_DATA_DEFINITIONS_DIR / f"{dataset_name}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file( + yaml_path=ymal_file + ) + prepare_synthetic_dataset(dsdef=dsdef) + + data_pipeline = load_preprocessed_data_pipeline( + dataset_name=dataset_name, + # task_type=TaskType.generate_embeddings, + is_synthetic=is_synth, + ) + + train_dataloader = data_pipeline.train_dataloader(batch_size=512, num_workers=2) + + output_dir = ENV.GT_EMBEDDINGS_DIR / dataset_name + embedding, sample_ids = embedding_extraction_with_cache( + dataloader=train_dataloader, + output_dir=output_dir, + embedding_type=EmbeddingType.text, + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Generate GT embeddings") + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Name of the dataset (e.g., docvqa, mysynthetic, pubtabnet)", + ) + + parser.add_argument( + "--is_synth", + action="store_true", + help="If set, determines that the dataset is a synthetic dataset", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(dataset_name=args.dataset, is_synth=args.is_synth) diff --git a/docgenie/analyzation/gt/kie/kie_gt_analysis.py b/docgenie/analyzation/gt/kie/kie_gt_analysis.py new file mode 100755 index 0000000000000000000000000000000000000000..83ace6ae524fd883ed9b5711fa9454519f91c848 --- /dev/null +++ b/docgenie/analyzation/gt/kie/kie_gt_analysis.py @@ -0,0 +1,568 @@ +""" +Compare KIE Ground Truth between Synthetic and Real Datasets +For CVPR paper on synthesis of document understanding datasets +""" + +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from collections import defaultdict, Counter +from typing import List, Dict, Tuple +import pandas as pd +from scipy import stats + +from docgenie import ENV +from docgenie.analyzation.gt.webapp import get_base_dataset_name +from docgenie.data.interfaces.dataset import load_dataset +from docgenie.generation.models._syndatadef import SynDatasetDefinition + + +# Set publication-quality style 
+sns.set_style("whitegrid") +sns.set_context("paper", font_scale=1.3) +plt.rcParams["font.family"] = "serif" +plt.rcParams["font.serif"] = ["Times New Roman"] + plt.rcParams["font.serif"] +plt.rcParams["figure.dpi"] = 300 +plt.rcParams["savefig.dpi"] = 300 +plt.rcParams["savefig.bbox"] = "tight" + + +def parse_bio_tags_to_entities( + word_labels_names: List[str], +) -> List[Tuple[str, int, int]]: + """ + Parse BIO tags to extract complete entities. + + Args: + word_labels_names: List of BIO tags (e.g., ['B-HEADER', 'I-HEADER', 'B-QUESTION']) + + Returns: + List of tuples (entity_class, start_idx, end_idx) + """ + entities = [] + current_entity = None + current_start = None + + for idx, label in enumerate(word_labels_names): + if label.startswith("B-"): + # Save previous entity if exists + if current_entity is not None: + entities.append((current_entity, current_start, idx - 1)) + + # Start new entity + current_entity = label[2:] # Remove 'B-' prefix + current_start = idx + elif label.startswith("I-"): + # Continue current entity (if it matches) + entity_class = label[2:] # Remove 'I-' prefix + if current_entity is None or current_entity != entity_class: + # Start new entity if no current or mismatch + if current_entity is not None: + entities.append((current_entity, current_start, idx - 1)) + current_entity = entity_class + current_start = idx + else: + # 'O' tag or other - end current entity + if current_entity is not None: + entities.append((current_entity, current_start, idx - 1)) + current_entity = None + current_start = None + + # Don't forget last entity + if current_entity is not None: + entities.append((current_entity, current_start, len(word_labels_names) - 1)) + + return entities + + +def get_entity_spatial_info( + entities: List[Tuple[str, int, int]], word_bboxes: List[List[float]] +) -> Dict[str, List[Tuple[float, float]]]: + """ + Extract spatial information (centers) for each entity class. + + Args: + entities: List of (entity_class, start_idx, end_idx) + word_bboxes: List of normalized bboxes in XYXY format + + Returns: + Dict mapping entity_class to list of (x_center, y_center) positions + """ + spatial_info = defaultdict(list) + + for entity_class, start_idx, end_idx in entities: + # Get all bboxes for this entity + entity_bboxes = word_bboxes[start_idx : end_idx + 1] + + # Calculate entity center (average of all word centers) + x_centers = [(bbox[0] + bbox[2]) / 2 for bbox in entity_bboxes] + y_centers = [(bbox[1] + bbox[3]) / 2 for bbox in entity_bboxes] + + entity_x_center = np.mean(x_centers) + entity_y_center = np.mean(y_centers) + + spatial_info[entity_class].append((entity_x_center, entity_y_center)) + + return spatial_info + + +def analyze_dataset( + dataset, is_synthetic=False, label_mapping: dict[str, str] = None +) -> Dict: + """ + Analyze a dataset and extract statistics. 
+ + Returns: + Dictionary with various statistics + """ + stats_dict = { + "entity_counts": defaultdict(int), + "entity_counts_per_sample": defaultdict(list), + "spatial_distributions": defaultdict(list), + "entity_lengths": defaultdict(list), # Number of words per entity + "total_samples": 0, + "total_entities": 0, + } + + for sample in dataset.train: + stats_dict["total_samples"] += 1 + + # Get word labels and bboxes + annotation = sample.annotations[0] # EntityLabelingAnnotation + word_labels_names = annotation.word_labels.name + word_bboxes = sample.content.word_bboxes.value + + # Parse entities + entities = parse_bio_tags_to_entities(word_labels_names) + + if label_mapping is not None and len(label_mapping) > 0: + entities = [(label_mapping[e], _s, _e) for (e, _s, _e) in entities] + + # Count entities per class in this sample + sample_entity_counts = Counter([e[0] for e in entities]) + + for entity_class, count in sample_entity_counts.items(): + stats_dict["entity_counts"][entity_class] += count + stats_dict["entity_counts_per_sample"][entity_class].append(count) + + # Add zeros for missing classes in this sample + all_classes = set(stats_dict["entity_counts"].keys()) + for entity_class in all_classes: + if entity_class not in sample_entity_counts: + stats_dict["entity_counts_per_sample"][entity_class].append(0) + + # Get spatial info + spatial_info = get_entity_spatial_info(entities, word_bboxes) + for entity_class, positions in spatial_info.items(): + stats_dict["spatial_distributions"][entity_class].extend(positions) + + # Entity lengths + for entity_class, start_idx, end_idx in entities: + length = end_idx - start_idx + 1 + stats_dict["entity_lengths"][entity_class].append(length) + + stats_dict["total_entities"] += len(entities) + + return stats_dict + + +def plot_entity_distribution_comparison( + real_stats: Dict, synth_stats: Dict, output_prefix: str +): + """ + Plot comparison of entity class distributions. 
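+
+    Counts are normalized to per-dataset percentages before plotting, so the
+    real and synthetic datasets remain comparable even when their sizes differ.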
+ """ + all_classes = sorted( + set( + list(real_stats["entity_counts"].keys()) + + list(synth_stats["entity_counts"].keys()) + ) + ) + + real_counts = [real_stats["entity_counts"].get(cls, 0) for cls in all_classes] + synth_counts = [synth_stats["entity_counts"].get(cls, 0) for cls in all_classes] + + # Normalize to percentages + real_total = sum(real_counts) + synth_total = sum(synth_counts) + real_pcts = [c / real_total * 100 for c in real_counts] + synth_pcts = [c / synth_total * 100 for c in synth_counts] + + # Create DataFrame for seaborn + df_pct = pd.DataFrame( + { + "Entity Class": all_classes * 2, + "Percentage": real_pcts + synth_pcts, + "Dataset": ["Real"] * len(all_classes) + ["Synthetic"] * len(all_classes), + } + ) + + # Plot with seaborn + fig, ax = plt.subplots(1, 1, figsize=(7, 5)) + + # Color palette + palette = sns.color_palette("Set2", 2) + + # Percentages + sns.barplot( + data=df_pct, + x="Entity Class", + y="Percentage", + hue="Dataset", + palette=palette, + ax=ax, + alpha=0.85, + ) + ax.set_xlabel("") + ax.set_ylabel("") + ax.tick_params(axis="x", rotation=90) + plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, ha="center") + + # Format y-axis to show percentage symbol + ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{int(y)}%")) + + ax.legend(frameon=True, loc="upper right", fontsize=11) + ax.grid(axis="y", alpha=0.3, linestyle="--", linewidth=0.5) + ax.set_axisbelow(True) + + plt.tight_layout() + plt.savefig( + ENV.KIE_GT_ANALYZATION_DIR / f"{output_prefix}_distribution_comparison.pdf", + dpi=300, + bbox_inches="tight", + ) + print( + f"Saved: {ENV.KIE_GT_ANALYZATION_DIR / output_prefix}_distribution_comparison.pdf" + ) + plt.close() + + +def plot_spatial_heatmaps(real_stats: Dict, synth_stats: Dict, output_prefix: str): + """ + Plot spatial heatmaps showing where entities appear on the page. 
+ """ + all_classes = sorted( + set( + list(real_stats["spatial_distributions"].keys()) + + list(synth_stats["spatial_distributions"].keys()) + ) + ) + + n_classes = len(all_classes) + n_cols = min(4, n_classes) + n_rows = (n_classes + n_cols - 1) // n_cols + + fig, axes = plt.subplots(n_rows, n_cols * 2, figsize=(6 * n_cols, 4 * n_rows)) + if n_rows == 1 and n_cols == 1: + axes = np.array([[axes]]) + elif n_rows == 1: + axes = axes.reshape(1, -1) + elif n_cols == 1: + axes = axes.reshape(-1, 1) + + # Use better colormap + cmap = sns.color_palette("rocket_r", as_cmap=True) + + for idx, entity_class in enumerate(all_classes): + row = idx // n_cols + col_base = (idx % n_cols) * 2 + + # Real data heatmap + real_positions = real_stats["spatial_distributions"].get(entity_class, []) + if len(real_positions) > 0: + x_coords = [pos[0] for pos in real_positions] + y_coords = [pos[1] for pos in real_positions] + + # Create 2D histogram (heatmap) + heatmap, xedges, yedges = np.histogram2d( + x_coords, y_coords, bins=20, range=[[0, 1], [0, 1]] + ) + + im1 = axes[row, col_base].imshow( + heatmap.T, + origin="upper", + cmap=cmap, + extent=[0, 1, 0, 1], + aspect="auto", + interpolation="bilinear", + ) + axes[row, col_base].set_title( + f"{entity_class}\n(Real, n={len(real_positions)})", + fontweight="bold", + fontsize=12, + pad=10, + ) + axes[row, col_base].set_xlabel("X Position", fontsize=11) + axes[row, col_base].set_ylabel("Y Position", fontsize=11) + cbar1 = plt.colorbar(im1, ax=axes[row, col_base], fraction=0.046, pad=0.04) + cbar1.ax.tick_params(labelsize=9) + else: + axes[row, col_base].text( + 0.5, 0.5, "No data", ha="center", va="center", fontsize=12 + ) + axes[row, col_base].set_title( + f"{entity_class}\n(Real, n=0)", fontweight="bold", fontsize=12, pad=10 + ) + axes[row, col_base].set_xticks([]) + axes[row, col_base].set_yticks([]) + + # Synthetic data heatmap + synth_positions = synth_stats["spatial_distributions"].get(entity_class, []) + if len(synth_positions) > 0: + x_coords = [pos[0] for pos in synth_positions] + y_coords = [pos[1] for pos in synth_positions] + + heatmap, xedges, yedges = np.histogram2d( + x_coords, y_coords, bins=20, range=[[0, 1], [0, 1]] + ) + + im2 = axes[row, col_base + 1].imshow( + heatmap.T, + origin="upper", + cmap=cmap, + extent=[0, 1, 0, 1], + aspect="auto", + interpolation="bilinear", + ) + axes[row, col_base + 1].set_title( + f"{entity_class}\n(Synth, n={len(synth_positions)})", + fontweight="bold", + fontsize=12, + pad=10, + ) + axes[row, col_base + 1].set_xlabel("X Position", fontsize=11) + axes[row, col_base + 1].set_ylabel("Y Position", fontsize=11) + cbar2 = plt.colorbar( + im2, ax=axes[row, col_base + 1], fraction=0.046, pad=0.04 + ) + cbar2.ax.tick_params(labelsize=9) + else: + axes[row, col_base + 1].text( + 0.5, 0.5, "No data", ha="center", va="center", fontsize=12 + ) + axes[row, col_base + 1].set_title( + f"{entity_class}\n(Synth, n=0)", fontweight="bold", fontsize=12, pad=10 + ) + axes[row, col_base + 1].set_xticks([]) + axes[row, col_base + 1].set_yticks([]) + + # Hide empty subplots + for idx in range(n_classes, n_rows * n_cols): + row = idx // n_cols + col_base = (idx % n_cols) * 2 + axes[row, col_base].axis("off") + axes[row, col_base + 1].axis("off") + + plt.tight_layout() + plt.savefig( + ENV.KIE_GT_ANALYZATION_DIR / f"{output_prefix}_spatial_heatmaps.pdf", + dpi=300, + bbox_inches="tight", + ) + print(f"Saved: {ENV.KIE_GT_ANALYZATION_DIR / output_prefix}_spatial_heatmaps.pdf") + plt.close() + + +def plot_entity_length_comparison( + 
real_stats: Dict, synth_stats: Dict, output_prefix: str +): + """ + Compare distribution of entity lengths (number of words per entity). + """ + all_classes = sorted( + set( + list(real_stats["entity_lengths"].keys()) + + list(synth_stats["entity_lengths"].keys()) + ) + ) + + n_classes = len(all_classes) + n_cols = min(3, n_classes) + n_rows = (n_classes + n_cols - 1) // n_cols + + fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows)) + if n_classes == 1: + axes = np.array([axes]) + axes = axes.flatten() + + # Color palette + colors = sns.color_palette("Set2", 2) + + for idx, entity_class in enumerate(all_classes): + real_lengths = real_stats["entity_lengths"].get(entity_class, []) + synth_lengths = synth_stats["entity_lengths"].get(entity_class, []) + + ax = axes[idx] + + if len(real_lengths) > 0 or len(synth_lengths) > 0: + # Plot histograms + bins = ( + np.arange( + 1, + max(max(real_lengths, default=1), max(synth_lengths, default=1)) + + 2, + ) + - 0.5 + ) + + ax.hist( + real_lengths, + bins=bins, + alpha=0.7, + label="Real", + density=True, + color=colors[0], + edgecolor="black", + linewidth=0.5, + ) + ax.hist( + synth_lengths, + bins=bins, + alpha=0.7, + label="Synthetic", + density=True, + color=colors[1], + edgecolor="black", + linewidth=0.5, + ) + + # Add statistics + real_mean = np.mean(real_lengths) if len(real_lengths) > 0 else 0 + synth_mean = np.mean(synth_lengths) if len(synth_lengths) > 0 else 0 + + ax.axvline( + real_mean, color=colors[0], linestyle="--", linewidth=2.5, alpha=0.8 + ) + ax.axvline( + synth_mean, color=colors[1], linestyle="--", linewidth=2.5, alpha=0.8 + ) + + ax.set_title( + f"{entity_class}\nReal μ={real_mean:.2f}, Synth μ={synth_mean:.2f}", + fontweight="bold", + fontsize=12, + pad=10, + ) + ax.set_xlabel("Entity Length (words)", fontsize=11, fontweight="bold") + ax.set_ylabel("Density", fontsize=11, fontweight="bold") + ax.legend(frameon=True, loc="best", fontsize=10) + ax.grid(axis="y", alpha=0.3, linestyle="--", linewidth=0.5) + ax.set_axisbelow(True) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + else: + ax.text( + 0.5, + 0.5, + "No data", + ha="center", + va="center", + transform=ax.transAxes, + fontsize=12, + ) + ax.set_title(f"{entity_class}", fontweight="bold", fontsize=12, pad=10) + ax.set_xticks([]) + ax.set_yticks([]) + + # Hide empty subplots + for idx in range(n_classes, len(axes)): + axes[idx].axis("off") + + plt.tight_layout() + plt.savefig( + ENV.KIE_GT_ANALYZATION_DIR / f"{output_prefix}_entity_length_comparison.png", + dpi=300, + bbox_inches="tight", + ) + print( + f"Saved: {ENV.KIE_GT_ANALYZATION_DIR / output_prefix}_entity_length_comparison.png" + ) + plt.close() + + +def compute_statistical_tests(real_stats: Dict, synth_stats: Dict) -> pd.DataFrame: + """ + Compute statistical tests to compare distributions. 
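+
+    Per entity class this runs a Mann-Whitney U test on per-document counts and
+    two-sample KS tests on the x/y coordinates of entity centers, returning one
+    row per class as a DataFrame.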
+ """ + all_classes = sorted( + set( + list(real_stats["entity_counts"].keys()) + + list(synth_stats["entity_counts"].keys()) + ) + ) + + results = [] + + for entity_class in all_classes: + real_counts_per_sample = real_stats["entity_counts_per_sample"].get( + entity_class, [] + ) + synth_counts_per_sample = synth_stats["entity_counts_per_sample"].get( + entity_class, [] + ) + + # Mann-Whitney U test (non-parametric) + if len(real_counts_per_sample) > 0 and len(synth_counts_per_sample) > 0: + statistic, p_value = stats.mannwhitneyu( + real_counts_per_sample, synth_counts_per_sample, alternative="two-sided" + ) + else: + statistic, p_value = np.nan, np.nan + + # KS test for spatial distributions + real_spatial = real_stats["spatial_distributions"].get(entity_class, []) + synth_spatial = synth_stats["spatial_distributions"].get(entity_class, []) + + ks_x, ks_y = np.nan, np.nan + if len(real_spatial) > 0 and len(synth_spatial) > 0: + real_x = [pos[0] for pos in real_spatial] + synth_x = [pos[0] for pos in synth_spatial] + real_y = [pos[1] for pos in real_spatial] + synth_y = [pos[1] for pos in synth_spatial] + + ks_x, _ = stats.ks_2samp(real_x, synth_x) + ks_y, _ = stats.ks_2samp(real_y, synth_y) + + results.append( + { + "Entity Class": entity_class, + "Real Count": real_stats["entity_counts"].get(entity_class, 0), + "Synth Count": synth_stats["entity_counts"].get(entity_class, 0), + "Real Mean/Sample": np.mean(real_counts_per_sample) + if real_counts_per_sample + else 0, + "Synth Mean/Sample": np.mean(synth_counts_per_sample) + if synth_counts_per_sample + else 0, + "Mann-Whitney p-value": p_value, + "KS Stat (X)": ks_x, + "KS Stat (Y)": ks_y, + } + ) + + return pd.DataFrame(results) + + +def print_summary_statistics(real_stats: Dict, synth_stats: Dict): + """ + Print summary statistics. 
+ """ + print("\n" + "=" * 80) + print("SUMMARY STATISTICS") + print("=" * 80) + + print(f"\nReal Dataset:") + print(f" Total samples: {real_stats['total_samples']}") + print(f" Total entities: {real_stats['total_entities']}") + print( + f" Avg entities/sample: {real_stats['total_entities'] / real_stats['total_samples']:.2f}" + ) + + print(f"\nSynthetic Dataset:") + print(f" Total samples: {synth_stats['total_samples']}") + print(f" Total entities: {synth_stats['total_entities']}") + print( + f" Avg entities/sample: {synth_stats['total_entities'] / synth_stats['total_samples']:.2f}" + ) + + print("\n" + "=" * 80) diff --git a/docgenie/analyzation/gt/kie/kie_gt_analysis_full.py b/docgenie/analyzation/gt/kie/kie_gt_analysis_full.py new file mode 100755 index 0000000000000000000000000000000000000000..6dcbd659cd79f225a567510be7ce6e80201a71d4 --- /dev/null +++ b/docgenie/analyzation/gt/kie/kie_gt_analysis_full.py @@ -0,0 +1,211 @@ +""" +Complete KIE GT Comparison Pipeline +Example usage with all metrics and visualizations +""" + +import argparse +from docgenie import ENV +from docgenie.analyzation.gt.kie.kie_gt_analysis import ( + analyze_dataset, + plot_entity_distribution_comparison, + plot_spatial_heatmaps, + plot_entity_length_comparison, + compute_statistical_tests, + print_summary_statistics, +) + +from docgenie.analyzation.gt.kie.kie_gt_analysis_utils import ( + compute_jensen_shannon_divergence, + compute_spatial_coverage_metrics, + plot_entity_co_occurrence_matrix, + plot_document_level_statistics, + generate_latex_table, + comprehensive_analysis, +) +from docgenie.analyzation.gt.webapp import get_base_dataset_name +from docgenie.data.interfaces.dataset import load_dataset +from docgenie.generation.models._syndatadef import SynDatasetDefinition + + +def full_comparison_pipeline( + synth_dataset_name: str, +): + """ + Complete comparison pipeline with all metrics and visualizations. 
+ + Args: + synth_dataset_name: Name of synthetic dataset + get_base_dataset_name_func: Function to get base dataset name + load_dataset_func: Function to load datasets + output_prefix: Prefix for output files + """ + + print("=" * 80) + print("COMPLETE KIE GROUND TRUTH COMPARISON PIPELINE") + print("=" * 80) + + # ========== STEP 1: Load Datasets ========== + print("\n[1/6] Loading datasets...") + base_dataset_name = get_base_dataset_name(synth_dataset_name) + print(f" Base dataset: {base_dataset_name}") + print(f" Synthetic dataset: {synth_dataset_name}") + + base_dataset = load_dataset(base_dataset_name, is_synthetic=False) + synth_dataset = load_dataset(synth_dataset_name, is_synthetic=True) + print(" ✓ Datasets loaded") + + deffile = ENV.SYN_DATA_DEFINITIONS_DIR / f"{synth_dataset_name}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file(deffile) + label_mapping = dsdef.label_mapping + + # ========== STEP 2: Analyze Datasets ========== + print("\n[2/6] Analyzing datasets...") + print(" Analyzing real dataset...") + real_stats = analyze_dataset(base_dataset, is_synthetic=False, label_mapping=None) + print( + f" ✓ {real_stats['total_samples']} samples, {real_stats['total_entities']} entities" + ) + + print(" Analyzing synthetic dataset...") + synth_stats = analyze_dataset( + synth_dataset, is_synthetic=True, label_mapping=label_mapping + ) + print( + f" ✓ {synth_stats['total_samples']} samples, {synth_stats['total_entities']} entities" + ) + + # ========== STEP 3: Summary Statistics ========== + print("\n[3/6] Computing summary statistics...") + print_summary_statistics(real_stats, synth_stats) + + # ========== STEP 4: Statistical Tests ========== + print("\n[4/6] Running statistical tests...") + stats_df = compute_statistical_tests(real_stats, synth_stats) + print("\n" + stats_df.to_string(index=False)) + stats_df.to_csv( + f"{ENV.KIE_GT_ANALYZATION_DIR / synth_dataset_name}_statistics.csv", index=False + ) + print( + f"\n ✓ Saved: {ENV.KIE_GT_ANALYZATION_DIR / synth_dataset_name}_statistics.csv" + ) + + # Divergence metrics + divergence = compute_jensen_shannon_divergence(real_stats, synth_stats) + print(f"\n Distribution Similarity:") + print(f" Jensen-Shannon Divergence: {divergence['overall_js_divergence']:.4f}") + print(f" (Lower is better, 0 = identical, 1 = completely different)") + + # Spatial metrics + spatial_metrics = compute_spatial_coverage_metrics(real_stats, synth_stats) + print(f"\n Spatial Distribution (Centroid Distances):") + for entity_class, metrics in spatial_metrics.items(): + print(f" {entity_class}: {metrics['centroid_distance']:.4f}") + + # ========== STEP 5: Generate Visualizations ========== + print("\n[5/6] Generating visualizations...") + + print(" Creating distribution comparison plots...") + plot_entity_distribution_comparison(real_stats, synth_stats, synth_dataset_name) + + print(" Creating spatial heatmaps...") + plot_spatial_heatmaps(real_stats, synth_stats, synth_dataset_name) + + print(" Creating entity length comparison plots...") + plot_entity_length_comparison(real_stats, synth_stats, synth_dataset_name) + + print(" Creating co-occurrence matrices...") + plot_entity_co_occurrence_matrix( + real_stats, + synth_stats, + base_dataset.train, + synth_dataset.train, + synth_dataset_name, + ) + + print(" Creating document-level statistics...") + plot_document_level_statistics(real_stats, synth_stats, synth_dataset_name) + + # ========== STEP 6: Generate Paper Materials ========== + print("\n[6/6] Generating paper materials...") + 
generate_latex_table(stats_df, divergence, synth_dataset_name) + + # ========== Summary Report ========== + print("\n" + "=" * 80) + print("COMPARISON COMPLETE!") + print("=" * 80) + print("\nGenerated files:") + print(f" • {synth_dataset_name}_statistics.csv") + print(f" • {synth_dataset_name}_distribution_comparison.png") + print(f" • {synth_dataset_name}_spatial_heatmaps.png") + print(f" • {synth_dataset_name}_entity_length_comparison.png") + print(f" • {synth_dataset_name}_cooccurrence_matrix.png") + print(f" • {synth_dataset_name}_document_statistics.png") + print(f" • {synth_dataset_name}_table.tex") + + print("\nKey Findings:") + print( + f" • Dataset sizes: {real_stats['total_samples']} real vs {synth_stats['total_samples']} synthetic" + ) + print( + f" • Total entities: {real_stats['total_entities']} real vs {synth_stats['total_entities']} synthetic" + ) + print( + f" • Distribution similarity (JS): {divergence['overall_js_divergence']:.4f}" + ) + print( + f" • Average entities/doc: {real_stats['total_entities'] / real_stats['total_samples']:.2f} real vs " + f"{synth_stats['total_entities'] / synth_stats['total_samples']:.2f} synthetic" + ) + + # Identify classes with significant differences + sig_diff_classes = stats_df[stats_df["Mann-Whitney p-value"] < 0.05][ + "Entity Class" + ].tolist() + if sig_diff_classes: + print( + f" • Significantly different classes (p<0.05): {', '.join(sig_diff_classes)}" + ) + else: + print(f" • No significantly different classes detected (p<0.05)") + + print("\n" + "=" * 80) + + return { + "real_stats": real_stats, + "synth_stats": synth_stats, + "statistics_df": stats_df, + "divergence": divergence, + "spatial_metrics": spatial_metrics, + } + + +def parse_args(): + parser = argparse.ArgumentParser( + description="KIE GT Comparison", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "synthdataset", + type=str, + help="Name of the synthetic dataset", + ) + + args = parser.parse_args() + return args + + +if __name__ == "__main__": + # Run the complete pipeline + args = parse_args() + synth_dataset_name = args.synthdataset + + results = full_comparison_pipeline( + synth_dataset_name=synth_dataset_name, + ) + + # Access results for further analysis + print("\nResults dictionary contains:") + print(f" - real_stats: {list(results['real_stats'].keys())}") + print(f" - synth_stats: {list(results['synth_stats'].keys())}") + print(f" - statistics_df: {results['statistics_df'].shape}") diff --git a/docgenie/analyzation/gt/kie/kie_gt_analysis_utils.py b/docgenie/analyzation/gt/kie/kie_gt_analysis_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..5efb76186a3c5e42561983cbc1df92d9778b3d54 --- /dev/null +++ b/docgenie/analyzation/gt/kie/kie_gt_analysis_utils.py @@ -0,0 +1,428 @@ +""" +Supplementary analysis utilities for KIE GT comparison +Additional metrics for CVPR paper +""" + +import numpy as np +import matplotlib.pyplot as plt +from collections import defaultdict +from typing import Dict, List, Tuple +import seaborn as sns +from scipy.spatial.distance import jensenshannon +from scipy.stats import entropy + +from docgenie import ENV + + +def compute_jensen_shannon_divergence( + real_stats: Dict, synth_stats: Dict +) -> Dict[str, float]: + """ + Compute Jensen-Shannon divergence for entity distributions. + Lower is better (0 = identical distributions, 1 = completely different). 
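+
+    Class-count vectors are normalized to probability distributions before the
+    divergence is computed. Illustrative usage (a sketch; stats dicts come from
+    ``analyze_dataset``):
+
+        div = compute_jensen_shannon_divergence(real_stats, synth_stats)
+        print(div["overall_js_divergence"], div["overall_kl_divergence"])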
+ """ + all_classes = sorted( + set( + list(real_stats["entity_counts"].keys()) + + list(synth_stats["entity_counts"].keys()) + ) + ) + + real_counts = np.array( + [real_stats["entity_counts"].get(cls, 0) for cls in all_classes] + ) + synth_counts = np.array( + [synth_stats["entity_counts"].get(cls, 0) for cls in all_classes] + ) + + # Normalize to probabilities + real_probs = ( + real_counts / real_counts.sum() if real_counts.sum() > 0 else real_counts + ) + synth_probs = ( + synth_counts / synth_counts.sum() if synth_counts.sum() > 0 else synth_counts + ) + + # JS divergence + js_div = jensenshannon(real_probs, synth_probs) + + return { + "overall_js_divergence": js_div, + "overall_kl_divergence": entropy(real_probs, synth_probs), + } + + +def compute_spatial_coverage_metrics(real_stats: Dict, synth_stats: Dict) -> Dict: + """ + Compute spatial coverage metrics comparing how well synthetic data + covers the spatial distribution of real data. + """ + results = {} + + for entity_class in real_stats["spatial_distributions"].keys(): + real_pos = real_stats["spatial_distributions"].get(entity_class, []) + synth_pos = synth_stats["spatial_distributions"].get(entity_class, []) + + if len(real_pos) == 0 or len(synth_pos) == 0: + continue + + real_x = np.array([p[0] for p in real_pos]) + real_y = np.array([p[1] for p in real_pos]) + synth_x = np.array([p[0] for p in synth_pos]) + synth_y = np.array([p[1] for p in synth_pos]) + + # Mean absolute difference in centroids + real_centroid = (real_x.mean(), real_y.mean()) + synth_centroid = (synth_x.mean(), synth_y.mean()) + centroid_distance = np.sqrt( + (real_centroid[0] - synth_centroid[0]) ** 2 + + (real_centroid[1] - synth_centroid[1]) ** 2 + ) + + # Standard deviation comparison + real_std = (real_x.std(), real_y.std()) + synth_std = (synth_x.std(), synth_y.std()) + std_diff = (abs(real_std[0] - synth_std[0]), abs(real_std[1] - synth_std[1])) + + results[entity_class] = { + "centroid_distance": centroid_distance, + "std_x_diff": std_diff[0], + "std_y_diff": std_diff[1], + "real_centroid": real_centroid, + "synth_centroid": synth_centroid, + "real_std": real_std, + "synth_std": synth_std, + } + + return results + + +def plot_entity_co_occurrence_matrix( + real_stats: Dict, synth_stats: Dict, samples_real, samples_synth, output_prefix: str +): + """ + Plot co-occurrence matrices showing which entities appear together in documents. + Useful for understanding document structure preservation. 
+ """ + + # Build co-occurrence matrices + def build_cooccurrence(samples): + from itertools import combinations + + co_occur = defaultdict(int) + all_classes = set() + + for sample in samples: + annotation = sample.annotations[0] + word_labels_names = annotation.word_labels.name + + # Get unique entity classes in this sample + entities_in_sample = set() + for label in word_labels_names: + if label.startswith("B-") or label.startswith("I-"): + entity_class = label[2:] + entities_in_sample.add(entity_class) + all_classes.add(entity_class) + + # Count co-occurrences + for pair in combinations(sorted(entities_in_sample), 2): + co_occur[pair] += 1 + + return co_occur, sorted(all_classes) + + real_cooccur, real_classes = build_cooccurrence(samples_real) + synth_cooccur, synth_classes = build_cooccurrence(samples_synth) + + all_classes = sorted(set(real_classes + synth_classes)) + n = len(all_classes) + + # Build matrices + real_matrix = np.zeros((n, n)) + synth_matrix = np.zeros((n, n)) + + class_to_idx = {cls: idx for idx, cls in enumerate(all_classes)} + + for (cls1, cls2), count in real_cooccur.items(): + i, j = class_to_idx[cls1], class_to_idx[cls2] + real_matrix[i, j] = count + real_matrix[j, i] = count + + for (cls1, cls2), count in synth_cooccur.items(): + i, j = class_to_idx[cls1], class_to_idx[cls2] + synth_matrix[i, j] = count + synth_matrix[j, i] = count + + # Normalize + real_matrix = ( + real_matrix / real_matrix.sum() if real_matrix.sum() > 0 else real_matrix + ) + synth_matrix = ( + synth_matrix / synth_matrix.sum() if synth_matrix.sum() > 0 else synth_matrix + ) + + # Plot + fig, axes = plt.subplots(1, 3, figsize=(20, 6)) + + # Real + sns.heatmap( + real_matrix, + annot=False, + cmap="Blues", + ax=axes[0], + xticklabels=all_classes, + yticklabels=all_classes, + cbar_kws={"label": "Frequency"}, + ) + axes[0].set_title("Real Data Co-occurrence", fontsize=14, fontweight="bold") + axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha="right") + axes[0].set_yticklabels(axes[0].get_yticklabels(), rotation=0) + + # Synthetic + sns.heatmap( + synth_matrix, + annot=False, + cmap="Oranges", + ax=axes[1], + xticklabels=all_classes, + yticklabels=all_classes, + cbar_kws={"label": "Frequency"}, + ) + axes[1].set_title("Synthetic Data Co-occurrence", fontsize=14, fontweight="bold") + axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45, ha="right") + axes[1].set_yticklabels(axes[1].get_yticklabels(), rotation=0) + + # Difference + diff_matrix = np.abs(real_matrix - synth_matrix) + sns.heatmap( + diff_matrix, + annot=False, + cmap="Reds", + ax=axes[2], + xticklabels=all_classes, + yticklabels=all_classes, + cbar_kws={"label": "Abs Difference"}, + ) + axes[2].set_title("Absolute Difference", fontsize=14, fontweight="bold") + axes[2].set_xticklabels(axes[2].get_xticklabels(), rotation=45, ha="right") + axes[2].set_yticklabels(axes[2].get_yticklabels(), rotation=0) + + plt.tight_layout() + plt.savefig( + ENV.KIE_GT_ANALYZATION_DIR / f"{output_prefix}_cooccurrence_matrix.png", + dpi=300, + bbox_inches="tight", + ) + print( + f"Saved: {ENV.KIE_GT_ANALYZATION_DIR / output_prefix}_cooccurrence_matrix.png" + ) + plt.close() + + +def plot_document_level_statistics( + real_stats: Dict, synth_stats: Dict, output_prefix: str +): + """ + Plot document-level statistics (entities per document, etc.). 
+ """ + # Compute entities per document + real_entities_per_doc = [] + synth_entities_per_doc = [] + + for entity_class in real_stats["entity_counts_per_sample"].keys(): + real_entities_per_doc.extend( + real_stats["entity_counts_per_sample"][entity_class] + ) + + for entity_class in synth_stats["entity_counts_per_sample"].keys(): + synth_entities_per_doc.extend( + synth_stats["entity_counts_per_sample"][entity_class] + ) + + # Aggregate by document + n_docs_real = real_stats["total_samples"] + n_docs_synth = synth_stats["total_samples"] + + # Reshape data properly - sum across entity types per document + # We need to reorganize the per_sample data + all_classes = sorted( + set( + list(real_stats["entity_counts_per_sample"].keys()) + + list(synth_stats["entity_counts_per_sample"].keys()) + ) + ) + + # Get max document count + max_docs = max( + max( + [ + len(real_stats["entity_counts_per_sample"].get(cls, [])) + for cls in all_classes + ], + default=0, + ), + max( + [ + len(synth_stats["entity_counts_per_sample"].get(cls, [])) + for cls in all_classes + ], + default=0, + ), + ) + + real_total_per_doc = np.zeros(n_docs_real) + synth_total_per_doc = np.zeros(n_docs_synth) + + for entity_class in all_classes: + real_counts = real_stats["entity_counts_per_sample"].get(entity_class, []) + synth_counts = synth_stats["entity_counts_per_sample"].get(entity_class, []) + + real_total_per_doc[: len(real_counts)] += np.array(real_counts) + synth_total_per_doc[: len(synth_counts)] += np.array(synth_counts) + + # Plot + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + # Histogram + axes[0].hist(real_total_per_doc, bins=20, alpha=0.6, label="Real", density=True) + axes[0].hist( + synth_total_per_doc, bins=20, alpha=0.6, label="Synthetic", density=True + ) + axes[0].axvline( + real_total_per_doc.mean(), + color="blue", + linestyle="--", + linewidth=2, + label=f"Real μ={real_total_per_doc.mean():.1f}", + ) + axes[0].axvline( + synth_total_per_doc.mean(), + color="orange", + linestyle="--", + linewidth=2, + label=f"Synth μ={synth_total_per_doc.mean():.1f}", + ) + axes[0].set_xlabel("Total Entities per Document", fontsize=12) + axes[0].set_ylabel("Density", fontsize=12) + axes[0].set_title( + "Distribution of Entities per Document", fontsize=14, fontweight="bold" + ) + axes[0].legend() + axes[0].grid(axis="y", alpha=0.3) + + # Cumulative distribution + real_sorted = np.sort(real_total_per_doc) + synth_sorted = np.sort(synth_total_per_doc) + real_cdf = np.arange(1, len(real_sorted) + 1) / len(real_sorted) + synth_cdf = np.arange(1, len(synth_sorted) + 1) / len(synth_sorted) + + axes[1].plot(real_sorted, real_cdf, label="Real", linewidth=2) + axes[1].plot(synth_sorted, synth_cdf, label="Synthetic", linewidth=2) + axes[1].set_xlabel("Total Entities per Document", fontsize=12) + axes[1].set_ylabel("Cumulative Probability", fontsize=12) + axes[1].set_title("Cumulative Distribution", fontsize=14, fontweight="bold") + axes[1].legend() + axes[1].grid(alpha=0.3) + + plt.tight_layout() + plt.savefig( + ENV.KIE_GT_ANALYZATION_DIR / f"{output_prefix}_document_statistics.png", + dpi=300, + bbox_inches="tight", + ) + print( + f"Saved: {ENV.KIE_GT_ANALYZATION_DIR / output_prefix}_document_statistics.png" + ) + plt.close() + + +def generate_latex_table(stats_df, divergence_metrics, output_prefix: str): + """ + Generate LaTeX table for paper. 
+ """ + latex_str = "\\begin{table}[t]\n" + latex_str += "\\centering\n" + latex_str += "\\caption{Comparison of Entity Distributions between Real and Synthetic Datasets}\n" + latex_str += "\\label{tab:entity_comparison}\n" + latex_str += "\\begin{tabular}{lrrrrr}\n" + latex_str += "\\toprule\n" + latex_str += ( + "Entity Class & Real & Synth & Real (\\%) & Synth (\\%) & p-value \\\\\n" + ) + latex_str += "\\midrule\n" + + total_real = stats_df["Real Count"].sum() + total_synth = stats_df["Synth Count"].sum() + + for _, row in stats_df.iterrows(): + entity = row["Entity Class"] + real_count = int(row["Real Count"]) + synth_count = int(row["Synth Count"]) + real_pct = (real_count / total_real * 100) if total_real > 0 else 0 + synth_pct = (synth_count / total_synth * 100) if total_synth > 0 else 0 + p_val = row["Mann-Whitney p-value"] + + p_str = f"{p_val:.4f}" if not np.isnan(p_val) else "---" + if not np.isnan(p_val) and p_val < 0.001: + p_str = "$<$0.001" + + latex_str += f"{entity} & {real_count} & {synth_count} & {real_pct:.1f} & {synth_pct:.1f} & {p_str} \\\\\n" + + latex_str += "\\midrule\n" + latex_str += f"Total & {total_real} & {total_synth} & 100.0 & 100.0 & --- \\\\\n" + latex_str += "\\bottomrule\n" + latex_str += "\\end{tabular}\n" + latex_str += "\\end{table}\n" + + # Add divergence metrics as separate note + latex_str += "\n% Divergence Metrics:\n" + latex_str += f"% JS Divergence: {divergence_metrics['overall_js_divergence']:.4f}\n" + latex_str += f"% KL Divergence: {divergence_metrics['overall_kl_divergence']:.4f}\n" + + with open(ENV.KIE_GT_ANALYZATION_DIR / f"{output_prefix}_table.tex", "w") as f: + f.write(latex_str) + + print(f"Saved: {ENV.KIE_GT_ANALYZATION_DIR / output_prefix}_table.tex") + print("\nLaTeX Table Preview:") + print(latex_str) + + +def comprehensive_analysis( + synth_dataset_name: str, + real_stats: Dict, + synth_stats: Dict, + real_samples, + synth_samples, + output_prefix: str, +): + """ + Run all supplementary analyses. + """ + print("\n" + "=" * 80) + print("ADVANCED ANALYSIS") + print("=" * 80) + + # JS divergence + print("\nComputing distribution divergence metrics...") + divergence = compute_jensen_shannon_divergence(real_stats, synth_stats) + print(f" Jensen-Shannon Divergence: {divergence['overall_js_divergence']:.4f}") + print(f" KL Divergence: {divergence['overall_kl_divergence']:.4f}") + + # Spatial coverage + print("\nComputing spatial coverage metrics...") + spatial_metrics = compute_spatial_coverage_metrics(real_stats, synth_stats) + print(" Centroid distances:") + for entity_class, metrics in spatial_metrics.items(): + print(f" {entity_class}: {metrics['centroid_distance']:.4f}") + + # Generate additional plots + print("\nGenerating additional visualizations...") + plot_entity_co_occurrence_matrix( + real_stats, synth_stats, real_samples, synth_samples, output_prefix + ) + plot_document_level_statistics(real_stats, synth_stats, output_prefix) + + print("\n" + "=" * 80) + + +if __name__ == "__main__": + ... diff --git a/docgenie/analyzation/gt/qa/qa_gt_analysis old.py b/docgenie/analyzation/gt/qa/qa_gt_analysis old.py new file mode 100755 index 0000000000000000000000000000000000000000..acf0c79e84e72fddd2354c895a40b3ad3a25112a --- /dev/null +++ b/docgenie/analyzation/gt/qa/qa_gt_analysis old.py @@ -0,0 +1,589 @@ +""" +Compare QA Ground Truth between Synthetic and Real Document Understanding Datasets + +This script compares question-answer pairs from synthetic and real datasets using: +1. Question type distribution analysis +2. 
Embedding similarity metrics (KL/JS divergence, MMD) +3. UMAP projection overlay quantification + +For CVPR paper on synthesis of document understanding datasets. +""" + +import numpy as np +import matplotlib.pyplot as plt +from scipy.stats import entropy +from scipy.spatial.distance import jensenshannon, cdist +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.neighbors import KernelDensity +from typing import Literal, Tuple, Dict, List +import re +from collections import Counter + +from docgenie import ENV +from docgenie.analyzation.gt.embeddings_qa import load_qa_embeddings +from docgenie.analyzation.gt.webapp import get_base_dataset_name + + +def extract_question_type(question: str) -> str: + """ + Extract question type based on starting word. + + Args: + question: Question string + + Returns: + Question type: 'who', 'what', 'when', 'where', 'why', 'how', or 'other' + """ + question = question.lower().strip() + + # Common question starters + if question.startswith("who"): + return "who" + elif question.startswith("what"): + return "what" + elif question.startswith("when"): + return "when" + elif question.startswith("where"): + return "where" + elif question.startswith("why"): + return "why" + elif question.startswith("how"): + return "how" + else: + return "other" + + +def compute_question_type_distribution(questions: List[str]) -> Dict[str, float]: + """ + Compute distribution of question types. + + Args: + questions: List of question strings + + Returns: + Dictionary mapping question type to ratio + """ + types = [extract_question_type(q) for q in questions] + counter = Counter(types) + total = len(types) + + distribution = { + "who": counter.get("who", 0) / total, + "what": counter.get("what", 0) / total, + "when": counter.get("when", 0) / total, + "where": counter.get("where", 0) / total, + "why": counter.get("why", 0) / total, + "how": counter.get("how", 0) / total, + "other": counter.get("other", 0) / total, + } + + return distribution + + +def compute_cosine_similarity_histogram( + emb1: np.ndarray, emb2: np.ndarray, n_bins: int = 50 +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Compute histograms of cosine similarities within each dataset. 
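+    Similarities are computed within each dataset (upper triangle of the
+    pairwise matrix, diagonal excluded), embeddings are subsampled to at most
+    5000 rows, and both histograms are binned over [-1, 1] and normalized to
+    sum to one.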
+ + Args: + emb1: Embeddings from dataset 1 (N1 x D) + emb2: Embeddings from dataset 2 (N2 x D) + n_bins: Number of histogram bins + + Returns: + hist1, hist2, bins1, bins2 + """ + # Compute pairwise cosine similarities within each dataset + # Sample if datasets are too large + max_samples = 5000 + if len(emb1) > max_samples: + idx1 = np.random.choice(len(emb1), max_samples, replace=False) + emb1_sample = emb1[idx1] + else: + emb1_sample = emb1 + + if len(emb2) > max_samples: + idx2 = np.random.choice(len(emb2), max_samples, replace=False) + emb2_sample = emb2[idx2] + else: + emb2_sample = emb2 + + # Compute cosine similarities + sim1 = cosine_similarity(emb1_sample) + sim2 = cosine_similarity(emb2_sample) + + # Get upper triangle (excluding diagonal) to avoid self-similarities + triu_idx = np.triu_indices_from(sim1, k=1) + sim1_values = sim1[triu_idx] + + triu_idx = np.triu_indices_from(sim2, k=1) + sim2_values = sim2[triu_idx] + + # Compute histograms on same bins + bins = np.linspace(-1, 1, n_bins + 1) + hist1, _ = np.histogram(sim1_values, bins=bins, density=True) + hist2, _ = np.histogram(sim2_values, bins=bins, density=True) + + # Normalize to get probability distributions + hist1 = hist1 / hist1.sum() + hist2 = hist2 / hist2.sum() + + return hist1, hist2, bins[:-1], bins + + +def compute_kl_divergence( + p: np.ndarray, q: np.ndarray, epsilon: float = 1e-10 +) -> float: + """ + Compute KL divergence KL(P||Q). + + Args: + p: Probability distribution P + q: Probability distribution Q + epsilon: Small value to avoid log(0) + + Returns: + KL divergence value + """ + p = np.array(p) + epsilon + q = np.array(q) + epsilon + p = p / p.sum() + q = q / q.sum() + + return entropy(p, q) + + +def compute_js_divergence(p: np.ndarray, q: np.ndarray) -> float: + """ + Compute Jensen-Shannon divergence. + + Args: + p: Probability distribution P + q: Probability distribution Q + + Returns: + JS divergence value (0 to 1) + """ + return jensenshannon(p, q) + + +def compute_mmd_rbf(X: np.ndarray, Y: np.ndarray, gamma: float = None) -> float: + """ + Compute Maximum Mean Discrepancy with RBF kernel. + + Args: + X: Samples from distribution P (N1 x D) + Y: Samples from distribution Q (N2 x D) + gamma: RBF kernel bandwidth (if None, uses median heuristic) + + Returns: + MMD^2 value + """ + # Sample if datasets are too large + max_samples = 2000 + if len(X) > max_samples: + X = X[np.random.choice(len(X), max_samples, replace=False)] + if len(Y) > max_samples: + Y = Y[np.random.choice(len(Y), max_samples, replace=False)] + + # Use median heuristic for gamma if not provided + if gamma is None: + XY = np.vstack([X, Y]) + dists = cdist(XY, XY) + gamma = 1.0 / (2 * np.median(dists[dists > 0]) ** 2) + + def rbf_kernel(X, Y, gamma): + """RBF kernel matrix.""" + XX = np.sum(X**2, axis=1)[:, np.newaxis] + YY = np.sum(Y**2, axis=1)[np.newaxis, :] + XY = X @ Y.T + dists_sq = XX + YY - 2 * XY + return np.exp(-gamma * dists_sq) + + K_XX = rbf_kernel(X, X, gamma) + K_YY = rbf_kernel(Y, Y, gamma) + K_XY = rbf_kernel(X, Y, gamma) + + m = len(X) + n = len(Y) + + # MMD^2 estimator + mmd_sq = (K_XX.sum() - np.trace(K_XX)) / (m * (m - 1)) + mmd_sq += (K_YY.sum() - np.trace(K_YY)) / (n * (n - 1)) + mmd_sq -= 2 * K_XY.mean() + + return mmd_sq + + +def compute_umap_overlay_metrics( + umap_real: np.ndarray, umap_synth: np.ndarray +) -> Dict[str, float]: + """ + Quantify the quality of UMAP projection overlay. 
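+    Reported metrics include per-axis (and, when the POT package is available,
+    2-D) Wasserstein distances, KL/JS divergence between KDE density estimates
+    on a shared grid, symmetric Chamfer distance, and a nearest-neighbour
+    coverage score.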
+ + Args: + umap_real: UMAP 2D projections of real data (N1 x 2) + umap_synth: UMAP 2D projections of synthetic data (N2 x 2) + + Returns: + Dictionary with overlay quality metrics + """ + metrics = {} + + # 1. Wasserstein distance (Earth Mover's Distance) + from scipy.stats import wasserstein_distance + + # Compute 1D Wasserstein on each dimension + w_dist_x = wasserstein_distance(umap_real[:, 0], umap_synth[:, 0]) + w_dist_y = wasserstein_distance(umap_real[:, 1], umap_synth[:, 1]) + metrics["wasserstein_x"] = w_dist_x + metrics["wasserstein_y"] = w_dist_y + metrics["wasserstein_avg"] = (w_dist_x + w_dist_y) / 2 + + # 2. 2D Wasserstein (using optimal transport if available) + try: + import ot + + # Normalize to uniform weights + a = np.ones(len(umap_real)) / len(umap_real) + b = np.ones(len(umap_synth)) / len(umap_synth) + M = ot.dist(umap_real, umap_synth, metric="euclidean") + w_dist_2d = ot.emd2(a, b, M) + metrics["wasserstein_2d"] = w_dist_2d + except ImportError: + print("Note: Python Optimal Transport (POT) not available for 2D Wasserstein") + + # 3. KL divergence of 2D density estimates + # Estimate densities using KDE + kde_real = KernelDensity(bandwidth=0.5, kernel="gaussian") + kde_synth = KernelDensity(bandwidth=0.5, kernel="gaussian") + + kde_real.fit(umap_real) + kde_synth.fit(umap_synth) + + # Create grid for density evaluation + x_min = min(umap_real[:, 0].min(), umap_synth[:, 0].min()) + x_max = max(umap_real[:, 0].max(), umap_synth[:, 0].max()) + y_min = min(umap_real[:, 1].min(), umap_synth[:, 1].min()) + y_max = max(umap_real[:, 1].max(), umap_synth[:, 1].max()) + + x_grid = np.linspace(x_min, x_max, 50) + y_grid = np.linspace(y_min, y_max, 50) + X_grid, Y_grid = np.meshgrid(x_grid, y_grid) + grid_points = np.column_stack([X_grid.ravel(), Y_grid.ravel()]) + + # Evaluate densities + log_dens_real = kde_real.score_samples(grid_points) + log_dens_synth = kde_synth.score_samples(grid_points) + + dens_real = np.exp(log_dens_real) + dens_synth = np.exp(log_dens_synth) + + # Normalize + dens_real = dens_real / dens_real.sum() + dens_synth = dens_synth / dens_synth.sum() + + # Compute KL and JS divergence + metrics["kl_divergence_2d"] = compute_kl_divergence(dens_real, dens_synth) + metrics["js_divergence_2d"] = compute_js_divergence(dens_real, dens_synth) + + # 4. Chamfer distance (average nearest neighbor distance) + from scipy.spatial import distance_matrix + + # Real to Synth + dists_r2s = distance_matrix(umap_real, umap_synth) + chamfer_r2s = dists_r2s.min(axis=1).mean() + + # Synth to Real + dists_s2r = distance_matrix(umap_synth, umap_real) + chamfer_s2r = dists_s2r.min(axis=0).mean() + + metrics["chamfer_real_to_synth"] = chamfer_r2s + metrics["chamfer_synth_to_real"] = chamfer_s2r + metrics["chamfer_symmetric"] = (chamfer_r2s + chamfer_s2r) / 2 + + # 5. Coverage metric (what fraction of real data is "covered" by synth) + # Define "coverage" as having a synthetic point within threshold distance + threshold = np.percentile(dists_r2s.min(axis=1), 95) # Adaptive threshold + coverage = (dists_r2s.min(axis=1) < threshold).mean() + metrics["coverage"] = coverage + + return metrics + + +def plot_comparison_results( + real_dist: Dict[str, float], + synth_dist: Dict[str, float], + hist1: np.ndarray, + hist2: np.ndarray, + bins: np.ndarray, + umap_real: np.ndarray, + umap_synth: np.ndarray, + save_path: str = None, +): + """ + Create visualization of comparison results. 
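+    Produces a 2x2 figure: question-type bars, within-dataset cosine-similarity
+    curves, a UMAP scatter overlay, and KDE density contours (real solid,
+    synthetic dashed).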
+ + Args: + real_dist: Question type distribution for real data + synth_dist: Question type distribution for synthetic data + hist1: Cosine similarity histogram for real data + hist2: Cosine similarity histogram for synthetic data + bins: Histogram bins + umap_real: UMAP projections of real data + umap_synth: UMAP projections of synthetic data + save_path: Path to save figure (if None, shows plot) + """ + fig, axes = plt.subplots(2, 2, figsize=(15, 12)) + + # 1. Question type distribution + ax = axes[0, 0] + question_types = list(real_dist.keys()) + x = np.arange(len(question_types)) + width = 0.35 + + ax.bar( + x - width / 2, + [real_dist[t] for t in question_types], + width, + label="Real", + alpha=0.8, + ) + ax.bar( + x + width / 2, + [synth_dist[t] for t in question_types], + width, + label="Synthetic", + alpha=0.8, + ) + + ax.set_xlabel("Question Type") + ax.set_ylabel("Ratio") + ax.set_title("Question Type Distribution") + ax.set_xticks(x) + ax.set_xticklabels(question_types) + ax.legend() + ax.grid(axis="y", alpha=0.3) + + # 2. Cosine similarity histograms + ax = axes[0, 1] + bin_centers = (bins[:-1] + bins[1:]) / 2 + ax.plot(bin_centers, hist1, label="Real", alpha=0.7, linewidth=2) + ax.plot(bin_centers, hist2, label="Synthetic", alpha=0.7, linewidth=2) + ax.fill_between(bin_centers, hist1, alpha=0.3) + ax.fill_between(bin_centers, hist2, alpha=0.3) + + ax.set_xlabel("Cosine Similarity") + ax.set_ylabel("Density") + ax.set_title("Pairwise Cosine Similarity Distribution") + ax.legend() + ax.grid(alpha=0.3) + + # 3. UMAP overlay - side by side + ax = axes[1, 0] + ax.scatter( + umap_real[:, 0], umap_real[:, 1], c="blue", alpha=0.3, s=10, label="Real" + ) + ax.scatter( + umap_synth[:, 0], umap_synth[:, 1], c="red", alpha=0.3, s=10, label="Synthetic" + ) + ax.set_xlabel("UMAP 1") + ax.set_ylabel("UMAP 2") + ax.set_title("UMAP Projection Overlay") + ax.legend() + ax.grid(alpha=0.3) + + # 4. UMAP density contours + ax = axes[1, 1] + from scipy.stats import gaussian_kde + + # Compute KDE for contours + kde_real = gaussian_kde(umap_real.T) + kde_synth = gaussian_kde(umap_synth.T) + + # Create grid + x_min = min(umap_real[:, 0].min(), umap_synth[:, 0].min()) + x_max = max(umap_real[:, 0].max(), umap_synth[:, 0].max()) + y_min = min(umap_real[:, 1].min(), umap_synth[:, 1].min()) + y_max = max(umap_real[:, 1].max(), umap_synth[:, 1].max()) + + xx, yy = np.mgrid[x_min:x_max:100j, y_min:y_max:100j] + positions = np.vstack([xx.ravel(), yy.ravel()]) + + z_real = np.reshape(kde_real(positions).T, xx.shape) + z_synth = np.reshape(kde_synth(positions).T, xx.shape) + + ax.contour(xx, yy, z_real, colors="blue", alpha=0.6, levels=5) + ax.contour(xx, yy, z_synth, colors="red", alpha=0.6, levels=5, linestyles="dashed") + + ax.set_xlabel("UMAP 1") + ax.set_ylabel("UMAP 2") + ax.set_title("UMAP Density Contours (Real: solid, Synth: dashed)") + ax.grid(alpha=0.3) + + plt.tight_layout() + + if save_path: + plt.savefig(save_path, dpi=300, bbox_inches="tight") + print(f"Figure saved to {save_path}") + else: + plt.show() + + +def compare_qa_datasets( + synth_dataset_name: str, + embedding_type: Literal["Q", "QA"], +) -> Dict[str, any]: + """ + Main function to compare QA datasets. 
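+
+    Illustrative usage (a sketch mirroring the __main__ example at the bottom
+    of this file; requires a cached UMAP projection for the dataset pair):
+
+        results = compare_qa_datasets("wtq_alpha=1.0", embedding_type="Q")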
+ + Args: + synth_dataset_name: Name of synthetic dataset + embedding_type: Type of embeddings to use + load_qa_embeddings: Function to load embeddings + get_base_dataset_name: Function to get base dataset name + umap_real: Optional pre-computed UMAP projections for real data + umap_synth: Optional pre-computed UMAP projections for synthetic data + + Returns: + Dictionary with all comparison metrics + """ + print("=" * 80) + print("QA Dataset Comparison") + print("=" * 80) + + # Load data + print(f"\nLoading datasets...") + base_dataset_name = get_base_dataset_name(synth_dataset_name) + print(f"Base dataset: {base_dataset_name}") + print(f"Synthetic dataset: {synth_dataset_name}") + + real_emb, real_q, real_a, real_sample_ids, real_doc_ids = load_qa_embeddings( + base_dataset_name, embedding_type + ) + synth_emb, synth_q, synth_a, synth_sample_ids, synth_doc_ids = load_qa_embeddings( + synth_dataset_name, embedding_type + ) + + print(f"Real dataset: {len(real_q)} QA pairs") + print(f"Synthetic dataset: {len(synth_q)} QA pairs") + + results = {} + + # 1. Question type distribution + print("\n" + "-" * 80) + print("1. Question Type Distribution") + print("-" * 80) + + real_dist = compute_question_type_distribution(real_q) + synth_dist = compute_question_type_distribution(synth_q) + + print("\nReal data:") + for qtype, ratio in real_dist.items(): + print(f" {qtype:8s}: {ratio:6.2%}") + + print("\nSynthetic data:") + for qtype, ratio in synth_dist.items(): + print(f" {qtype:8s}: {ratio:6.2%}") + + # Compute distribution divergence + types_ordered = ["who", "what", "when", "where", "why", "how", "other"] + real_dist_arr = np.array([real_dist[t] for t in types_ordered]) + synth_dist_arr = np.array([synth_dist[t] for t in types_ordered]) + + qtype_kl = compute_kl_divergence(real_dist_arr, synth_dist_arr) + qtype_js = compute_js_divergence(real_dist_arr, synth_dist_arr) + + print(f"\nKL divergence (Real||Synth): {qtype_kl:.4f}") + print(f"JS divergence: {qtype_js:.4f}") + + results["question_type_real"] = real_dist + results["question_type_synth"] = synth_dist + results["question_type_kl"] = qtype_kl + results["question_type_js"] = qtype_js + + # 2. Cosine similarity histograms + print("\n" + "-" * 80) + print("2. Embedding Similarity Distribution") + print("-" * 80) + + hist_real, hist_synth, bin_centers, bins = compute_cosine_similarity_histogram( + real_emb, synth_emb, n_bins=50 + ) + + sim_kl = compute_kl_divergence(hist_real, hist_synth) + sim_js = compute_js_divergence(hist_real, hist_synth) + + print(f"KL divergence (Real||Synth): {sim_kl:.4f}") + print(f"JS divergence: {sim_js:.4f}") + + results["similarity_hist_kl"] = sim_kl + results["similarity_hist_js"] = sim_js + results["similarity_hist_real"] = hist_real + results["similarity_hist_synth"] = hist_synth + results["similarity_bins"] = bin_centers + + # 3. MMD + print("\n" + "-" * 80) + print("3. Maximum Mean Discrepancy (MMD)") + print("-" * 80) + + mmd_value = compute_mmd_rbf(real_emb, synth_emb) + print(f"MMD² (RBF kernel): {mmd_value:.6f}") + + results["mmd_rbf"] = mmd_value + + # 4. UMAP overlay metrics (if provided) + projection_method = "umap" + projection_cache_path = ( + ENV.QA_GT_WEBAPP_CACHE_DIR + / f"projection_{projection_method}_{synth_dataset_name}_{embedding_type}.npy" + ) + projection_2d = np.load(projection_cache_path) + umap_real = projection_2d[: len(real_emb)] + umap_synth = projection_2d[len(real_emb) :] + + if umap_real is not None and umap_synth is not None: + print("\n" + "-" * 80) + print("4. 
UMAP Projection Overlay Metrics") + print("-" * 80) + + overlay_metrics = compute_umap_overlay_metrics(umap_real, umap_synth) + + for metric_name, value in overlay_metrics.items(): + print(f"{metric_name:30s}: {value:.6f}") + + results["umap_overlay"] = overlay_metrics + + # Create visualization + plot_comparison_results( + real_dist, + synth_dist, + hist_real, + hist_synth, + bins, + umap_real, + umap_synth, + save_path=ENV.QA_GT_ANALYZATION_DIR / "qa_comparison.png", + ) + + print("\n" + "=" * 80) + print("Comparison complete!") + print("=" * 80) + + return results + + +# Example usage +if __name__ == "__main__": + synth_dataset_name = "wtq_alpha=1.0" + + # Run comparison + results = compare_qa_datasets( + synth_dataset_name=synth_dataset_name, embedding_type="Q" + ) + + print("\n\nResults summary available in 'results' dictionary:") + print(results) + print("Visualization saved to 'qa_comparison.png'") diff --git a/docgenie/analyzation/gt/qa/qa_gt_analysis.py b/docgenie/analyzation/gt/qa/qa_gt_analysis.py new file mode 100755 index 0000000000000000000000000000000000000000..6e2fda40fa97d51a5ae814fd23af66c58046e86f --- /dev/null +++ b/docgenie/analyzation/gt/qa/qa_gt_analysis.py @@ -0,0 +1,576 @@ +""" +Compare QA Ground Truth between Synthetic and Real Document Understanding Datasets + +This script compares question-answer pairs from synthetic and real datasets using: +1. Question type distribution analysis +2. Embedding similarity metrics (KL/JS divergence, MMD) +3. UMAP projection overlay quantification + +For CVPR paper on synthesis of document understanding datasets. +""" + +import numpy as np +import matplotlib.pyplot as plt +from scipy.stats import entropy +from scipy.spatial.distance import jensenshannon, cdist +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.neighbors import KernelDensity +from typing import Literal, Tuple, Dict, List +import re +from collections import Counter + +from docgenie import ENV +from docgenie.analyzation.gt.embeddings_qa import load_qa_embeddings +from docgenie.analyzation.gt.webapp import get_base_dataset_name + + +def extract_question_type(question: str) -> str: + """ + Extract question type based on starting word. + + Args: + question: Question string + + Returns: + Question type: 'who', 'what', 'when', 'where', 'why', 'how', or 'other' + """ + question = question.lower().strip() + + # Common question starters + if question.startswith("who"): + return "who" + elif question.startswith("what"): + return "what" + elif question.startswith("when"): + return "when" + elif question.startswith("where"): + return "where" + elif question.startswith("why"): + return "why" + elif question.startswith("how"): + return "how" + else: + return "other" + + +def compute_question_type_distribution(questions: List[str]) -> Dict[str, float]: + """ + Compute distribution of question types. 
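+
+    Illustrative usage (a sketch with made-up questions):
+
+        dist = compute_question_type_distribution(
+            ["What is the total amount?", "Who signed the form?"]
+        )
+        # dist["what"] == 0.5, dist["who"] == 0.5, all other types == 0.0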
+ + Args: + questions: List of question strings + + Returns: + Dictionary mapping question type to ratio + """ + types = [extract_question_type(q) for q in questions] + counter = Counter(types) + total = len(types) + + distribution = { + "who": counter.get("who", 0) / total, + "what": counter.get("what", 0) / total, + "when": counter.get("when", 0) / total, + "where": counter.get("where", 0) / total, + "why": counter.get("why", 0) / total, + "how": counter.get("how", 0) / total, + "other": counter.get("other", 0) / total, + } + + return distribution + + +def compute_cosine_similarity_histogram( + emb1: np.ndarray, emb2: np.ndarray, n_bins: int = 50 +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Compute histograms of cosine similarities within each dataset. + + Args: + emb1: Embeddings from dataset 1 (N1 x D) + emb2: Embeddings from dataset 2 (N2 x D) + n_bins: Number of histogram bins + + Returns: + hist1, hist2, bins1, bins2 + """ + # Compute pairwise cosine similarities within each dataset + # Sample if datasets are too large + max_samples = 5000 + if len(emb1) > max_samples: + idx1 = np.random.choice(len(emb1), max_samples, replace=False) + emb1_sample = emb1[idx1] + else: + emb1_sample = emb1 + + if len(emb2) > max_samples: + idx2 = np.random.choice(len(emb2), max_samples, replace=False) + emb2_sample = emb2[idx2] + else: + emb2_sample = emb2 + + # Compute cosine similarities + sim1 = cosine_similarity(emb1_sample) + sim2 = cosine_similarity(emb2_sample) + + # Get upper triangle (excluding diagonal) to avoid self-similarities + triu_idx = np.triu_indices_from(sim1, k=1) + sim1_values = sim1[triu_idx] + + triu_idx = np.triu_indices_from(sim2, k=1) + sim2_values = sim2[triu_idx] + + # Compute histograms on same bins + bins = np.linspace(-1, 1, n_bins + 1) + hist1, _ = np.histogram(sim1_values, bins=bins, density=True) + hist2, _ = np.histogram(sim2_values, bins=bins, density=True) + + # Normalize to get probability distributions + hist1 = hist1 / hist1.sum() + hist2 = hist2 / hist2.sum() + + return hist1, hist2, bins[:-1], bins + + +def compute_kl_divergence( + p: np.ndarray, q: np.ndarray, epsilon: float = 1e-10 +) -> float: + """ + Compute KL divergence KL(P||Q). + + Args: + p: Probability distribution P + q: Probability distribution Q + epsilon: Small value to avoid log(0) + + Returns: + KL divergence value + """ + p = np.array(p) + epsilon + q = np.array(q) + epsilon + p = p / p.sum() + q = q / q.sum() + + return entropy(p, q) + + +def compute_js_divergence(p: np.ndarray, q: np.ndarray) -> float: + """ + Compute Jensen-Shannon divergence. + + Args: + p: Probability distribution P + q: Probability distribution Q + + Returns: + JS divergence value (0 to 1) + """ + return jensenshannon(p, q) + + +def compute_mmd_rbf(X: np.ndarray, Y: np.ndarray, gamma: float = None) -> float: + """ + Compute Maximum Mean Discrepancy with RBF kernel. 
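+
+    Uses the standard estimator MMD^2 ~= E[k(x, x')] + E[k(y, y')] - 2 E[k(x, y)],
+    where the within-sample terms average the RBF kernel over off-diagonal pairs
+    and the cross term over all pairs. When ``gamma`` is None the bandwidth is
+    set by the median-distance heuristic, and each input is subsampled to at
+    most 2000 rows.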
+ + Args: + X: Samples from distribution P (N1 x D) + Y: Samples from distribution Q (N2 x D) + gamma: RBF kernel bandwidth (if None, uses median heuristic) + + Returns: + MMD^2 value + """ + # Sample if datasets are too large + max_samples = 2000 + if len(X) > max_samples: + X = X[np.random.choice(len(X), max_samples, replace=False)] + if len(Y) > max_samples: + Y = Y[np.random.choice(len(Y), max_samples, replace=False)] + + # Use median heuristic for gamma if not provided + if gamma is None: + XY = np.vstack([X, Y]) + dists = cdist(XY, XY) + gamma = 1.0 / (2 * np.median(dists[dists > 0]) ** 2) + + def rbf_kernel(X, Y, gamma): + """RBF kernel matrix.""" + XX = np.sum(X**2, axis=1)[:, np.newaxis] + YY = np.sum(Y**2, axis=1)[np.newaxis, :] + XY = X @ Y.T + dists_sq = XX + YY - 2 * XY + return np.exp(-gamma * dists_sq) + + K_XX = rbf_kernel(X, X, gamma) + K_YY = rbf_kernel(Y, Y, gamma) + K_XY = rbf_kernel(X, Y, gamma) + + m = len(X) + n = len(Y) + + # MMD^2 estimator + mmd_sq = (K_XX.sum() - np.trace(K_XX)) / (m * (m - 1)) + mmd_sq += (K_YY.sum() - np.trace(K_YY)) / (n * (n - 1)) + mmd_sq -= 2 * K_XY.mean() + + return mmd_sq + + +def compute_umap_overlay_metrics( + umap_real: np.ndarray, umap_synth: np.ndarray +) -> Dict[str, float]: + """ + Quantify the quality of UMAP projection overlay. + + Args: + umap_real: UMAP 2D projections of real data (N1 x 2) + umap_synth: UMAP 2D projections of synthetic data (N2 x 2) + + Returns: + Dictionary with overlay quality metrics + """ + metrics = {} + + # 1. Wasserstein distance (Earth Mover's Distance) + from scipy.stats import wasserstein_distance + + # Compute 1D Wasserstein on each dimension + w_dist_x = wasserstein_distance(umap_real[:, 0], umap_synth[:, 0]) + w_dist_y = wasserstein_distance(umap_real[:, 1], umap_synth[:, 1]) + metrics["wasserstein_x"] = w_dist_x + metrics["wasserstein_y"] = w_dist_y + metrics["wasserstein_avg"] = (w_dist_x + w_dist_y) / 2 + + # 2. 2D Wasserstein (using optimal transport if available) + try: + import ot + + # Normalize to uniform weights + a = np.ones(len(umap_real)) / len(umap_real) + b = np.ones(len(umap_synth)) / len(umap_synth) + M = ot.dist(umap_real, umap_synth, metric="euclidean") + w_dist_2d = ot.emd2(a, b, M) + metrics["wasserstein_2d"] = w_dist_2d + except ImportError: + print("Note: Python Optimal Transport (POT) not available for 2D Wasserstein") + + # 3. 
KL divergence of 2D density estimates + # Estimate densities using KDE + kde_real = KernelDensity(bandwidth=0.5, kernel="gaussian") + kde_synth = KernelDensity(bandwidth=0.5, kernel="gaussian") + + kde_real.fit(umap_real) + kde_synth.fit(umap_synth) + + # Create grid for density evaluation + x_min = min(umap_real[:, 0].min(), umap_synth[:, 0].min()) + x_max = max(umap_real[:, 0].max(), umap_synth[:, 0].max()) + y_min = min(umap_real[:, 1].min(), umap_synth[:, 1].min()) + y_max = max(umap_real[:, 1].max(), umap_synth[:, 1].max()) + + x_grid = np.linspace(x_min, x_max, 50) + y_grid = np.linspace(y_min, y_max, 50) + X_grid, Y_grid = np.meshgrid(x_grid, y_grid) + grid_points = np.column_stack([X_grid.ravel(), Y_grid.ravel()]) + + # Evaluate densities + log_dens_real = kde_real.score_samples(grid_points) + log_dens_synth = kde_synth.score_samples(grid_points) + + dens_real = np.exp(log_dens_real) + dens_synth = np.exp(log_dens_synth) + + # Normalize + dens_real = dens_real / dens_real.sum() + dens_synth = dens_synth / dens_synth.sum() + + # Compute KL and JS divergence + metrics["kl_divergence_2d"] = compute_kl_divergence(dens_real, dens_synth) + metrics["js_divergence_2d"] = compute_js_divergence(dens_real, dens_synth) + + # 4. Chamfer distance (average nearest neighbor distance) + from scipy.spatial import distance_matrix + + # Real to Synth + dists_r2s = distance_matrix(umap_real, umap_synth) + chamfer_r2s = dists_r2s.min(axis=1).mean() + + # Synth to Real + dists_s2r = distance_matrix(umap_synth, umap_real) + chamfer_s2r = dists_s2r.min(axis=0).mean() + + metrics["chamfer_real_to_synth"] = chamfer_r2s + metrics["chamfer_synth_to_real"] = chamfer_s2r + metrics["chamfer_symmetric"] = (chamfer_r2s + chamfer_s2r) / 2 + + # 5. Coverage metric (what fraction of real data is "covered" by synth) + # Define "coverage" as having a synthetic point within threshold distance + threshold = np.percentile(dists_r2s.min(axis=1), 95) # Adaptive threshold + coverage = (dists_r2s.min(axis=1) < threshold).mean() + metrics["coverage"] = coverage + + return metrics + + +def plot_comparison_results( + real_dist: Dict[str, float], + synth_dist: Dict[str, float], + hist1: np.ndarray, + hist2: np.ndarray, + bins: np.ndarray, + umap_real: np.ndarray, + umap_synth: np.ndarray, + save_path: str = None, +): + """ + Create visualization of comparison results. + + Args: + real_dist: Question type distribution for real data + synth_dist: Question type distribution for synthetic data + hist1: Cosine similarity histogram for real data + hist2: Cosine similarity histogram for synthetic data + bins: Histogram bins + umap_real: UMAP projections of real data + umap_synth: UMAP projections of synthetic data + save_path: Path to save figure (if None, shows plot) + """ + + # 1. UMAP overlay + fig1, ax = plt.subplots(1, 1, figsize=(7, 6)) + ax.scatter( + umap_real[:, 0], umap_real[:, 1], c="blue", alpha=0.3, s=10, label="Real" + ) + ax.scatter( + umap_synth[:, 0], umap_synth[:, 1], c="red", alpha=0.3, s=10, label="Synthetic" + ) + ax.legend() + ax.grid(alpha=0.3) + ax.set_xticks([]) + ax.set_yticks([]) + + # Remove borders + for spine in ax.spines.values(): + spine.set_visible(False) + + plt.tight_layout() + + if save_path: + save_path = str(save_path) + # Create filename for UMAP plot + base_path = save_path.rsplit(".", 1)[0] + ext = save_path.rsplit(".", 1)[1] if "." 
in save_path else "pdf" + umap_path = f"{base_path}_umap.png" # PDFs have too many elements + plt.savefig(umap_path, dpi=300, bbox_inches="tight") + print(f"UMAP figure saved to {umap_path}") + else: + plt.show() + + plt.close(fig1) + + # 2. Question type distribution + fig2, ax = plt.subplots(1, 1, figsize=(7, 6)) + question_types = list(real_dist.keys()) + x = np.arange(len(question_types)) + width = 0.35 + + ax.bar( + x - width / 2, + [real_dist[t] for t in question_types], + width, + label="Real", + alpha=0.8, + color="blue", + ) + ax.bar( + x + width / 2, + [synth_dist[t] for t in question_types], + width, + label="Synthetic", + alpha=0.8, + color="red", + ) + + ax.set_xlabel("Question Type") + ax.set_ylabel("Ratio") + ax.set_xticks(x) + ax.set_xticklabels(question_types) + ax.legend() + ax.grid(axis="y", alpha=0.3) + + # Remove borders + for spine in ax.spines.values(): + spine.set_visible(False) + + plt.tight_layout() + + if save_path: + # Create filename for question types plot + base_path = save_path.rsplit(".", 1)[0] + ext = save_path.rsplit(".", 1)[1] if "." in save_path else "pdf" + qt_path = f"{base_path}_question_types.{ext}" + plt.savefig(qt_path, dpi=300, bbox_inches="tight") + print(f"Question types figure saved to {qt_path}") + else: + plt.show() + + plt.close(fig2) + + +def compare_qa_datasets( + synth_dataset_name: str, + embedding_type: Literal["Q", "QA"], +) -> Dict[str, any]: + """ + Main function to compare QA datasets. + + Args: + synth_dataset_name: Name of synthetic dataset + embedding_type: Type of embeddings to use + load_qa_embeddings: Function to load embeddings + get_base_dataset_name: Function to get base dataset name + umap_real: Optional pre-computed UMAP projections for real data + umap_synth: Optional pre-computed UMAP projections for synthetic data + + Returns: + Dictionary with all comparison metrics + """ + print("=" * 80) + print("QA Dataset Comparison") + print("=" * 80) + + # Load data + print(f"\nLoading datasets...") + base_dataset_name = get_base_dataset_name(synth_dataset_name) + print(f"Base dataset: {base_dataset_name}") + print(f"Synthetic dataset: {synth_dataset_name}") + + real_emb, real_q, real_a, real_sample_ids, real_doc_ids = load_qa_embeddings( + base_dataset_name, embedding_type + ) + synth_emb, synth_q, synth_a, synth_sample_ids, synth_doc_ids = load_qa_embeddings( + synth_dataset_name, embedding_type + ) + + print(f"Real dataset: {len(real_q)} QA pairs") + print(f"Synthetic dataset: {len(synth_q)} QA pairs") + + results = {} + + # 1. Question type distribution + print("\n" + "-" * 80) + print("1. 
Question Type Distribution") + print("-" * 80) + + real_dist = compute_question_type_distribution(real_q) + synth_dist = compute_question_type_distribution(synth_q) + + print("\nReal data:") + for qtype, ratio in real_dist.items(): + print(f" {qtype:8s}: {ratio:6.2%}") + + print("\nSynthetic data:") + for qtype, ratio in synth_dist.items(): + print(f" {qtype:8s}: {ratio:6.2%}") + + # Compute distribution divergence + types_ordered = ["who", "what", "when", "where", "why", "how", "other"] + real_dist_arr = np.array([real_dist[t] for t in types_ordered]) + synth_dist_arr = np.array([synth_dist[t] for t in types_ordered]) + + qtype_kl = compute_kl_divergence(real_dist_arr, synth_dist_arr) + qtype_js = compute_js_divergence(real_dist_arr, synth_dist_arr) + + print(f"\nKL divergence (Real||Synth): {qtype_kl:.4f}") + print(f"JS divergence: {qtype_js:.4f}") + + results["question_type_real"] = real_dist + results["question_type_synth"] = synth_dist + results["question_type_kl"] = qtype_kl + results["question_type_js"] = qtype_js + + # 2. Cosine similarity histograms + print("\n" + "-" * 80) + print("2. Embedding Similarity Distribution") + print("-" * 80) + + hist_real, hist_synth, bin_centers, bins = compute_cosine_similarity_histogram( + real_emb, synth_emb, n_bins=50 + ) + + sim_kl = compute_kl_divergence(hist_real, hist_synth) + sim_js = compute_js_divergence(hist_real, hist_synth) + + print(f"KL divergence (Real||Synth): {sim_kl:.4f}") + print(f"JS divergence: {sim_js:.4f}") + + results["similarity_hist_kl"] = sim_kl + results["similarity_hist_js"] = sim_js + results["similarity_hist_real"] = hist_real + results["similarity_hist_synth"] = hist_synth + results["similarity_bins"] = bin_centers + + # 3. MMD + print("\n" + "-" * 80) + print("3. Maximum Mean Discrepancy (MMD)") + print("-" * 80) + + mmd_value = compute_mmd_rbf(real_emb, synth_emb) + print(f"MMD² (RBF kernel): {mmd_value:.6f}") + + results["mmd_rbf"] = mmd_value + + # 4. UMAP overlay metrics (if provided) + projection_method = "umap" + projection_cache_path = ( + ENV.QA_GT_WEBAPP_CACHE_DIR + / f"projection_{projection_method}_{synth_dataset_name}_{embedding_type}.npy" + ) + projection_2d = np.load(projection_cache_path) + umap_real = projection_2d[: len(real_emb)] + umap_synth = projection_2d[len(real_emb) :] + + if umap_real is not None and umap_synth is not None: + print("\n" + "-" * 80) + print("4. 
UMAP Projection Overlay Metrics") + print("-" * 80) + + overlay_metrics = compute_umap_overlay_metrics(umap_real, umap_synth) + + for metric_name, value in overlay_metrics.items(): + print(f"{metric_name:30s}: {value:.6f}") + + results["umap_overlay"] = overlay_metrics + + # Create visualization + plot_comparison_results( + real_dist, + synth_dist, + hist_real, + hist_synth, + bins, + umap_real, + umap_synth, + save_path=ENV.QA_GT_ANALYZATION_DIR + / f"{synth_dataset_name}_qa_comparison.pdf", + ) + + print("\n" + "=" * 80) + print("Comparison complete!") + print("=" * 80) + + return results + + +# Example usage +if __name__ == "__main__": + synth_dataset_name = "wtq_alpha=1.0" + + # Run comparison + results = compare_qa_datasets( + synth_dataset_name=synth_dataset_name, embedding_type="Q" + ) + + print("\n\nResults summary available in 'results' dictionary:") + print(results) + print("Visualization saved to 'qa_comparison.png'") diff --git a/docgenie/analyzation/layoutfid/fid_calculator.py b/docgenie/analyzation/layoutfid/fid_calculator.py new file mode 100755 index 0000000000000000000000000000000000000000..912a54bac11906480355d79eff68a27d982c6df4 --- /dev/null +++ b/docgenie/analyzation/layoutfid/fid_calculator.py @@ -0,0 +1,241 @@ +import warnings +from pathlib import Path + +import numpy as np +import pandas as pd +import pydantic.v1 as pydantic +import pydantic_argparse +import torch +from PIL import Image +from scipy import linalg +from torch.utils.data import DataLoader +import tqdm +from transformers import AutoModel, AutoProcessor + +from docgenie.data._core._data_types import DocumentInstance +from docgenie.data._core._msgpack_dataset_reader import MsgpackDatasetReader +from docgenie.data.interface import ( + load_dataset, + load_synthetic_dataset, +) +from docgenie.logging import get_logger + +import torchvision.transforms.functional as TF +from torch.nn.functional import adaptive_avg_pool2d +from pytorch_fid.inception import InceptionV3 + +logger = get_logger(__name__) + +warnings.filterwarnings("ignore") + + +def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert ( + mu1.shape == mu2.shape + ), "Training and test mean vectors have different lengths" + assert ( + sigma1.shape == sigma2.shape + ), "Training and test covariances have different dimensions" + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ( + "fid calculation produces singular product; " + "adding %s to diagonal of cov estimates" + ) % eps + print(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError("Imaginary component {}".format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean + +def get_activations( + dataset, model, batch_size=50, dims=2048, device="cpu", num_workers=1 +): + model.eval() + + dataset.set_transform(lambda sample: TF.to_tensor(sample.image.content.convert("RGB").resize((1024, 1024)))) + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + 
drop_last=False, + num_workers=num_workers, + ) + + pred_arr = np.empty((len(dataset), dims)) + + start_idx = 0 + + for batch in tqdm.tqdm(dataloader): + batch = batch.to(device) + print('batch',batch.shape) + + with torch.no_grad(): + pred = model(batch)[0] + + # If model output is not scalar, apply global spatial average pooling. + # This happens if you choose a dimensionality not equal 2048. + if pred.size(2) != 1 or pred.size(3) != 1: + pred = adaptive_avg_pool2d(pred, output_size=(1, 1)) + + pred = pred.squeeze(3).squeeze(2).cpu().numpy() + + pred_arr[start_idx : start_idx + pred.shape[0]] = pred + + start_idx = start_idx + pred.shape[0] + + return pred_arr + + +def calculate_activation_statistics( + dataset, model, batch_size=50, dims=2048, device="cpu", num_workers=1 +): + act = get_activations(dataset, model, batch_size, dims, device, num_workers) + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + +def calculate_fid_given_datasets(real_dataset, syn_dataset, batch_size, device, dims, num_workers=1): + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + model = InceptionV3([block_idx]).to(device) + m1, s1 = calculate_activation_statistics( + real_dataset, model, batch_size, dims, device, num_workers + ) + m2, s2 = calculate_activation_statistics( + syn_dataset, model, batch_size, dims, device, num_workers + ) + fid_value = calculate_frechet_distance(m1, s1, m2, s2) + + return fid_value + + +class FIDCalculatorConfig(pydantic.BaseModel): + """ + Configuration for clustering operations. + """ + + seed: int = 42 + real_dataset_name: str + synth_dataset_name: str + batch_size: int = 50 + limit_sizes_to_smallest: bool = True + + +def main( + cfg: FIDCalculatorConfig, +): + """Example usage of FID calculator.""" + + # load the results csv + output_df_path = Path("data/results/fid.csv") + + # load the results csv and check if row with same real and synth dataset exists + if output_df_path.exists(): + output_df = pd.read_csv(output_df_path) + existing_row = output_df[ + (output_df["real_dataset"] == cfg.real_dataset_name) + & (output_df["synth_dataset"] == cfg.synth_dataset_name) + ] + if not existing_row.empty: + logger.info( + f"FID already calculated for real dataset '{cfg.real_dataset_name}' and synthetic dataset '{cfg.synth_dataset_name}'. Skipping calculation." 
+ ) + logger.info( + f"Existing FID Score: {existing_row['fid'].values[0]:.4f}" + ) + return + else: + output_df = pd.DataFrame( + columns=["real_dataset", "synth_dataset", "fid", "num_samples"] + ) + + # torch manual seed for reproducibility + torch.manual_seed(42) + + # logging config + logger.info("Calculating FID with config:") + logger.info(cfg.json(indent=4)) + + # load real dataset pipeline + real_dataset = load_dataset( + dataset_name=cfg.real_dataset_name, + create_train_val_splits=False, + ).train + + synth_dataset = load_synthetic_dataset( + dataset_name=cfg.synth_dataset_name, + ).train + + # assert datasets are not None + assert real_dataset is not None, "Real dataset train split is None" + assert synth_dataset is not None, "Synthetic dataset train split is None" + + # log dataset sizes + logger.info(f"Real dataset size: {len(real_dataset)}") + logger.info(f"Synthetic dataset size: {len(synth_dataset)}") + + # limit both datasets to smallest size + if cfg.limit_sizes_to_smallest: + real_size = len(real_dataset) # type: ignore + synth_size = len(synth_dataset) # type: ignore + + if real_size > synth_size: + logger.info( + f"Real dataset is bigger ({real_size} samples) than synthetic dataset ({synth_size} samples)." + ) + random_indices = torch.randperm(real_size)[:synth_size] + real_dataset.set_subset_indices(random_indices.tolist()) + else: + logger.info( + f"Synthetic dataset is bigger ({synth_size} samples) than real dataset ({real_size} samples)." + ) + random_indices = torch.randperm(synth_size)[:real_size] + synth_dataset.set_subset_indices(random_indices.tolist()) + + total_real_dataset_samples = len(real_dataset) # type: ignore + total_synth_dataset_samples = len(synth_dataset) # type: ignore + assert total_real_dataset_samples == total_synth_dataset_samples, ( + "FID calculation requires both datasets to have the same number of samples. " + f"Got {total_real_dataset_samples} real and {total_synth_dataset_samples} synthetic samples." 
+ ) + + num_samples = total_real_dataset_samples + fid = calculate_fid_given_datasets(real_dataset, synth_dataset, cfg.batch_size, device="cuda", dims=2048) + logger.info(f"\FID Score: {fid:.4f} over {num_samples} samples") + + # append result to csv + new_row = { + "real_dataset": cfg.real_dataset_name, + "synth_dataset": cfg.synth_dataset_name, + "fid": fid, + "num_samples": len(real_dataset), + } + output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True) + output_df.to_csv("data/results/fid.csv", index=False) + logger.info("FID score saved to data/results/fid.csv") + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=FIDCalculatorConfig, + ) + main(parser.parse_typed_args()) diff --git a/docgenie/analyzation/layoutfid/layoutfid_from_embeddings.py b/docgenie/analyzation/layoutfid/layoutfid_from_embeddings.py new file mode 100755 index 0000000000000000000000000000000000000000..2d128a0a15ad9feff04616ce584e6c42cb324b9f --- /dev/null +++ b/docgenie/analyzation/layoutfid/layoutfid_from_embeddings.py @@ -0,0 +1,402 @@ +import warnings +from pathlib import Path + +import numpy as np +import pandas as pd +from docgenie import ENV +from docgenie.analyzation.clustering.core._embeddings import _load_embeddings +import pydantic.v1 as pydantic +import pydantic_argparse +import torch +from PIL import Image +from scipy import linalg +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoModel, AutoProcessor + +from docgenie.data._core._data_types import DocumentInstance +from docgenie.data._core._msgpack_dataset_reader import MsgpackDatasetReader +from docgenie.data.interface import ( + load_dataset, + load_synthetic_dataset, +) +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +warnings.filterwarnings("ignore") + + +class LayoutFIDCalculator: + """ + GPU-accelerated LayoutFID score calculator using LayoutLMv3 embeddings. + """ + + def __init__( + self, device: str = "cuda", model_name: str = "microsoft/layoutlmv3-base" + ): + """ + Initialize LayoutFID calculator. + + Args: + device: 'cuda' or 'cpu' + model_name: HuggingFace model identifier for LayoutLMv3 + """ + self.device = device if torch.cuda.is_available() else "cpu" + logger.info(f"Using device: {self.device}") + + # Load LayoutLMv3 model and processor + self.processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False) + self.model = AutoModel.from_pretrained(model_name) + self.model.to(self.device) + self.model.eval() + + def _get_embeddings( + self, + dataset: MsgpackDatasetReader, + batch_size: int, + use_image_only: bool = False, + ) -> np.ndarray: + """ + Extract LayoutLMv3 embeddings for images. 
+ + Args: + image_paths: List of paths to document images + batch_size: Batch size for processing + + Returns: + Embeddings array of shape (n_images, embedding_dim) + """ + + embeddings_list = [] + + with torch.no_grad(): + dataloader = DataLoader( + dataset, # type: ignore + batch_size=batch_size, + shuffle=False, + num_workers=4, + pin_memory=True, + collate_fn=lambda x: x, + ) + for batch in tqdm( + dataloader, + desc=f"Extracting embeddings batch_size=[{batch_size}]", + total=len(dataloader), + ): + batch: list[DocumentInstance] + + # get images, words, boxes from batch + words, word_bboxes, images = [], [], [] + for sample in batch: + assert sample.image is not None, "Sample image is None" + assert isinstance(sample.image.content, Image.Image), ( + "Sample image content is not PIL Image" + ) + images.append(sample.image.content.convert("RGB")) + if use_image_only: + continue + assert sample.content is not None, "Sample content is None" + assert sample.content.word_bboxes is not None, ( + "Sample word bboxes are None" + ) + + words.append(sample.content.words) + word_bboxes.append(sample.content.word_bboxes.value) + + # Process images with LayoutLMv3 processor + inputs = self.processor( + text=words, + boxes=word_bboxes, + images=images, + return_tensors="pt", + padding=True, + truncation=True, + ) + + # layoutlmv3 expects bboxes in range [0, 1000] + # we assume to get normalized bboxes in [0, 1] + # scale bboxes + # if ( + # inputs["bbox"].max() > 1.01 or inputs["bbox"].min() < -0.01 + # ): # 1.1 to account for any floating point precision issues + # raise ValueError( + # f"Expected normalized bounding boxes in range [0, 1], Got max value {inputs['bbox'].max()}" + # ) + + inputs["bbox"] = (inputs["bbox"].clip(0.0, 1.0) * 1000).long() + + # Move to device + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + # Get model output + outputs = self.model(**inputs, output_hidden_states=True) + + # Use last hidden state (CLS token or mean pooling) + # Extract the [CLS] token representation (first token) + batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() + embeddings_list.append(batch_embeddings) + + embeddings = np.concatenate(embeddings_list, axis=0) + return embeddings + + def _compute_statistics(self, embeddings: np.ndarray) -> tuple: + """ + Compute mean and covariance of embeddings. + + Args: + embeddings: Array of shape (n_samples, embedding_dim) + + Returns: + Tuple of (mean, covariance) + """ + mu = np.mean(embeddings, axis=0) + sigma = np.cov(embeddings.T) + + # Ensure sigma is 2D (handle 1D case) + if sigma.ndim == 1: + sigma = np.diag(sigma) + + return mu, sigma + + # def _compute_fid( # this works same as calculate_frechet_distance but i kept the original as its taken from well-known FID implementation + # https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/inception.py + # self, mu1: np.ndarray, sigma1: np.ndarray, mu2: np.ndarray, sigma2: np.ndarray + # ) -> float: + # """ + # Compute Fréchet Inception Distance. 
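For reference, a minimal single-document sketch of the embedding step implemented in `_get_embeddings` above. The image, words and boxes are hypothetical; boxes are assumed normalised to [0, 1] and then scaled to LayoutLMv3's expected [0, 1000] range, and exact processor behaviour may vary with the installed `transformers` version.

```python
# Minimal sketch of the per-document LayoutLMv3 [CLS] embedding used above (toy inputs).
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = AutoModel.from_pretrained("microsoft/layoutlmv3-base").eval()

image = Image.new("RGB", (224, 224), "white")                     # stand-in document image
words = ["Invoice", "Total", "42.00"]                             # hypothetical OCR words
boxes = [[0.10, 0.10, 0.30, 0.15], [0.10, 0.80, 0.25, 0.85], [0.30, 0.80, 0.45, 0.85]]

inputs = processor(text=[words], boxes=[boxes], images=[image],
                   return_tensors="pt", padding=True, truncation=True)
inputs["bbox"] = (inputs["bbox"].clip(0.0, 1.0) * 1000).long()    # normalised [0, 1] -> [0, 1000]

with torch.no_grad():
    cls_embedding = model(**inputs).last_hidden_state[:, 0, :]    # (1, hidden_size) [CLS] vector
```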
+ + # Args: + # mu1, sigma1: Mean and covariance of real embeddings + # mu2, sigma2: Mean and covariance of generated embeddings + + # Returns: + # FID score + # """ + # # Euclidean distance between means + # diff = mu1 - mu2 + # diff_norm = np.sum(diff**2) + + # # Trace of covariance matrices + # trace_cov = np.trace(sigma1 + sigma2) + + # # Matrix square root of product of covariances + # # Using eigenvalue decomposition for numerical stability + # sqrt_cov_prod = self._sqrtm(sigma1 @ sigma2) + # trace_sqrt_prod = np.trace(sqrt_cov_prod) + + # # FID = ||µr - µg||^2 + Tr(Σr + Σg - 2√(ΣrΣg)) + # fid = diff_norm + trace_cov - 2 * trace_sqrt_prod + + # return float(np.real(fid)) + + def calculate_frechet_distance(self, mu1, sigma1, mu2, sigma2, eps=1e-6): + """Numpy implementation of the Frechet Distance. + The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + + Stable version by Dougal J. Sutherland. + + Params: + -- mu1 : Numpy array containing the activations of a layer of the + inception net (like returned by the function 'get_predictions') + for generated samples. + -- mu2 : The sample mean over activations, precalculated on an + representative data set. + -- sigma1: The covariance matrix over activations for generated samples. + -- sigma2: The covariance matrix over activations, precalculated on an + representative data set. + + Returns: + -- : The Frechet Distance. + """ + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, ( + "Training and test mean vectors have different lengths" + ) + assert sigma1.shape == sigma2.shape, ( + "Training and test covariances have different dimensions" + ) + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ( + "fid calculation produces singular product; " + "adding %s to diagonal of cov estimates" + ) % eps + logger.info(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError("Imaginary component {}".format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean + + @staticmethod + def _sqrtm(matrix: np.ndarray) -> np.ndarray: + """ + Compute matrix square root using eigenvalue decomposition. + More numerically stable than scipy.linalg.sqrtm for this use case. 
+ """ + try: + # Use scipy's sqrtm for general case + sqrt_m = linalg.sqrtm(matrix) + # Return real part if imaginary component is negligible + if np.iscomplexobj(sqrt_m): + sqrt_m = np.real(sqrt_m) + return sqrt_m + except np.linalg.LinAlgError: + # Fallback: eigenvalue decomposition + eigvals, eigvecs = np.linalg.eigh(matrix) + eigvals = np.maximum(eigvals, 0) # Ensure non-negative + sqrt_m = eigvecs @ np.diag(np.sqrt(eigvals)) @ eigvecs.T + return np.real(sqrt_m) + + def calculate_layoutfid( + self, + real_embeddings: "np.ndarray", + synth_embeddings: "np.ndarray", + limit_sizes_to_smallest: bool = True, + ) -> tuple[float, int]: + # limit both datasets to smallest size + if limit_sizes_to_smallest: + real_size = len(real_embeddings) # type: ignore + synth_size = len(synth_embeddings) # type: ignore + + # layout fix see which dataset is smaller in size + if real_size > synth_size: + logger.info( + f"Real embeddings is bigger ({real_size} samples) than synthetic dataset ({synth_size} samples)." + ) + random_indices = torch.randperm(real_size)[:synth_size] + real_embeddings = real_embeddings[random_indices.tolist()] + else: + logger.info( + f"Synthetic dataset is bigger ({synth_size} samples) than real dataset ({real_size} samples)." + ) + random_indices = torch.randperm(synth_size)[:real_size] + synth_embeddings = synth_embeddings[random_indices.tolist()] + + total_real_dataset_samples = len(real_embeddings) # type: ignore + total_synth_dataset_samples = len(synth_embeddings) # type: ignore + assert total_real_dataset_samples == total_synth_dataset_samples, ( + "FID calculation requires both datasets to have the same number of samples. " + f"Got {total_real_dataset_samples} real and {total_synth_dataset_samples} synthetic samples." + ) + logger.info("Calculating real statistics...") + mu_real, sigma_real = self._compute_statistics(real_embeddings) + logger.info("Calculating synthetic statistics...") + mu_gen, sigma_gen = self._compute_statistics(synth_embeddings) + layoutfid = self.calculate_frechet_distance( + mu_real, sigma_real, mu_gen, sigma_gen + ) + return layoutfid, real_embeddings.shape[0] + + +class LayoutFIDCalculatorConfig(pydantic.BaseModel): + """ + Configuration for clustering operations. + """ + + seed: int = 42 + real_dataset_name: str + synth_dataset_name: str + limit_sizes_to_smallest: bool = True + embedding_src: str = "layout" + + +def main( + cfg: LayoutFIDCalculatorConfig, +): + """Example usage of LayoutFID calculator.""" + + # load the results csv + output_df_path = Path("data/results/layout_fid_embeddings.csv") + + # load the results csv and check if row with same real and synth dataset exists + if output_df_path.exists(): + output_df = pd.read_csv(output_df_path) + existing_row = output_df[ + (output_df["real_dataset"] == cfg.real_dataset_name) + & (output_df["synth_dataset"] == cfg.synth_dataset_name) + & (output_df["embedding_src"] == cfg.embedding_src) + ] + if not existing_row.empty: + logger.info( + f"LayoutFID already calculated for real dataset '{cfg.real_dataset_name}' and synthetic dataset '{cfg.synth_dataset_name}'. Skipping calculation." 
+ ) + logger.info( + f"Existing LayoutFID Score: {existing_row['layoutfid_score'].values[0]:.4f}" + ) + return + else: + output_df = pd.DataFrame( + columns=["real_dataset", "synth_dataset", "layoutfid_score", "num_samples"] + ) + + # torch manual seed for reproducibility + torch.manual_seed(42) + + # logging config + logger.info("Calculating LayoutFID with config:") + logger.info(cfg.json(indent=4)) + + # load the real embeddings + real_embeddings, _ = _load_embeddings( + file_path=ENV.EMBEDDINGS_DIR / cfg.real_dataset_name / f"{cfg.embedding_src}.h5" + ) + + # load the synthetic embeddings + synthetic_embeddings, _ = _load_embeddings( + file_path=ENV.EMBEDDINGS_DIR + / 'synth' + / cfg.synth_dataset_name + / f"{cfg.embedding_src}.h5", + ) + + # Initialize calculator + calculator = LayoutFIDCalculator(device="cuda") + + # Calculate LayoutFID + layoutfid_score, num_samples = calculator.calculate_layoutfid( + real_embeddings, + synthetic_embeddings, + limit_sizes_to_smallest=cfg.limit_sizes_to_smallest, + ) + logger.info(f"\nLayoutFID Score: {layoutfid_score:.4f} over {num_samples} samples") + + # append result to csv + new_row = { + "real_dataset": cfg.real_dataset_name, + "synth_dataset": cfg.synth_dataset_name, + "layoutfid_score": layoutfid_score, + "num_samples": len(real_embeddings), + "embedding_src": cfg.embedding_src, + } + output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True) + output_df_path.parent.mkdir(parents=True, exist_ok=True) + output_df.to_csv(output_df_path, index=False) + logger.info("LayoutFID score saved to data/results/layout_fid.csv") + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=LayoutFIDCalculatorConfig, + ) + main(parser.parse_typed_args()) diff --git a/docgenie/analyzation/layoutfid/layoutfidcalculator.py b/docgenie/analyzation/layoutfid/layoutfidcalculator.py new file mode 100755 index 0000000000000000000000000000000000000000..aab33d1bd61a2adab76c848ae78dbebcfc69d280 --- /dev/null +++ b/docgenie/analyzation/layoutfid/layoutfidcalculator.py @@ -0,0 +1,440 @@ +import warnings +from pathlib import Path + +import numpy as np +import pandas as pd +import pydantic.v1 as pydantic +import pydantic_argparse +import torch +from PIL import Image +from scipy import linalg +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoModel, AutoProcessor + +from docgenie.data._core._data_types import DocumentInstance +from docgenie.data._core._msgpack_dataset_reader import MsgpackDatasetReader +from docgenie.data.interface import ( + load_dataset, + load_synthetic_dataset, +) +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +warnings.filterwarnings("ignore") + + +class LayoutFIDCalculator: + """ + GPU-accelerated LayoutFID score calculator using LayoutLMv3 embeddings. + """ + + def __init__( + self, device: str = "cuda", model_name: str = "microsoft/layoutlmv3-base" + ): + """ + Initialize LayoutFID calculator. 
+ + Args: + device: 'cuda' or 'cpu' + model_name: HuggingFace model identifier for LayoutLMv3 + """ + self.device = device if torch.cuda.is_available() else "cpu" + logger.info(f"Using device: {self.device}") + + # Load LayoutLMv3 model and processor + self.processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False) + self.model = AutoModel.from_pretrained(model_name) + self.model.to(self.device) + self.model.eval() + + def _get_embeddings( + self, + dataset: MsgpackDatasetReader, + batch_size: int, + use_image_only: bool = False, + ) -> np.ndarray: + """ + Extract LayoutLMv3 embeddings for images. + + Args: + image_paths: List of paths to document images + batch_size: Batch size for processing + + Returns: + Embeddings array of shape (n_images, embedding_dim) + """ + + embeddings_list = [] + + with torch.no_grad(): + dataloader = DataLoader( + dataset, # type: ignore + batch_size=batch_size, + shuffle=False, + num_workers=4, + pin_memory=True, + collate_fn=lambda x: x, + ) + for batch in tqdm( + dataloader, + desc=f"Extracting embeddings batch_size=[{batch_size}]", + total=len(dataloader), + ): + batch: list[DocumentInstance] + + # get images, words, boxes from batch + words, word_bboxes, images = [], [], [] + for sample in batch: + assert sample.image is not None, "Sample image is None" + assert isinstance(sample.image.content, Image.Image), ( + "Sample image content is not PIL Image" + ) + images.append(sample.image.content.convert("RGB")) + if use_image_only: + words.append(["None"]) + word_bboxes.append([[0, 0, 0, 0]]) + + continue + assert sample.content is not None, "Sample content is None" + assert sample.content.word_bboxes is not None, ( + "Sample word bboxes are None" + ) + + words.append(sample.content.words) + word_bboxes.append(sample.content.word_bboxes.value) + + # Process images with LayoutLMv3 processor + inputs = self.processor( + text=words, + boxes=word_bboxes, + images=images, + return_tensors="pt", + padding=True, + truncation=True, + ) + + # layoutlmv3 expects bboxes in range [0, 1000] + # we assume to get normalized bboxes in [0, 1] + # scale bboxes + # if ( + # inputs["bbox"].max() > 1.01 or inputs["bbox"].min() < -0.01 + # ): # 1.1 to account for any floating point precision issues + # raise ValueError( + # f"Expected normalized bounding boxes in range [0, 1], Got max value {inputs['bbox'].max()}" + # ) + + inputs["bbox"] = (inputs["bbox"].clip(0.0, 1.0) * 1000).long() + + # Move to device + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + for key, value in inputs.items(): + assert isinstance(value, torch.Tensor), ( + f"Expected tensor for input '{key}', got {type(value)}" + ) + if value is not None: + print(f"Input '{key}' shape: {value.shape}") + else: + print(f"Input '{key}' is None") + + # Get model output + outputs = self.model(**inputs, output_hidden_states=True) + + # Use last hidden state (CLS token or mean pooling) + # Extract the [CLS] token representation (first token) + batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() + embeddings_list.append(batch_embeddings) + + embeddings = np.concatenate(embeddings_list, axis=0) + return embeddings + + def _compute_statistics(self, embeddings: np.ndarray) -> tuple: + """ + Compute mean and covariance of embeddings. 
+ + Args: + embeddings: Array of shape (n_samples, embedding_dim) + + Returns: + Tuple of (mean, covariance) + """ + mu = np.mean(embeddings, axis=0) + sigma = np.cov(embeddings.T) + + # Ensure sigma is 2D (handle 1D case) + if sigma.ndim == 1: + sigma = np.diag(sigma) + + return mu, sigma + + # def _compute_fid( # this works same as calculate_frechet_distance but i kept the original as its taken from well-known FID implementation + # https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/inception.py + # self, mu1: np.ndarray, sigma1: np.ndarray, mu2: np.ndarray, sigma2: np.ndarray + # ) -> float: + # """ + # Compute Fréchet Inception Distance. + + # Args: + # mu1, sigma1: Mean and covariance of real embeddings + # mu2, sigma2: Mean and covariance of generated embeddings + + # Returns: + # FID score + # """ + # # Euclidean distance between means + # diff = mu1 - mu2 + # diff_norm = np.sum(diff**2) + + # # Trace of covariance matrices + # trace_cov = np.trace(sigma1 + sigma2) + + # # Matrix square root of product of covariances + # # Using eigenvalue decomposition for numerical stability + # sqrt_cov_prod = self._sqrtm(sigma1 @ sigma2) + # trace_sqrt_prod = np.trace(sqrt_cov_prod) + + # # FID = ||µr - µg||^2 + Tr(Σr + Σg - 2√(ΣrΣg)) + # fid = diff_norm + trace_cov - 2 * trace_sqrt_prod + + # return float(np.real(fid)) + + def calculate_frechet_distance(self, mu1, sigma1, mu2, sigma2, eps=1e-6): + """Numpy implementation of the Frechet Distance. + The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + + Stable version by Dougal J. Sutherland. + + Params: + -- mu1 : Numpy array containing the activations of a layer of the + inception net (like returned by the function 'get_predictions') + for generated samples. + -- mu2 : The sample mean over activations, precalculated on an + representative data set. + -- sigma1: The covariance matrix over activations for generated samples. + -- sigma2: The covariance matrix over activations, precalculated on an + representative data set. + + Returns: + -- : The Frechet Distance. + """ + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert mu1.shape == mu2.shape, ( + "Training and test mean vectors have different lengths" + ) + assert sigma1.shape == sigma2.shape, ( + "Training and test covariances have different dimensions" + ) + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ( + "fid calculation produces singular product; " + "adding %s to diagonal of cov estimates" + ) % eps + logger.info(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError("Imaginary component {}".format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean + + @staticmethod + def _sqrtm(matrix: np.ndarray) -> np.ndarray: + """ + Compute matrix square root using eigenvalue decomposition. + More numerically stable than scipy.linalg.sqrtm for this use case. 
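A small numerical check (toy data) of the eigen-decomposition fallback in `_sqrtm`: for a symmetric positive semi-definite matrix, the result squared should recover the input.

```python
# Verify the eigh-based matrix square root on a toy PSD covariance matrix.
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=(8, 8))
cov = a @ a.T                                     # symmetric positive semi-definite

eigvals, eigvecs = np.linalg.eigh(cov)
eigvals = np.maximum(eigvals, 0)                  # clamp tiny negative eigenvalues
sqrt_m = eigvecs @ np.diag(np.sqrt(eigvals)) @ eigvecs.T

assert np.allclose(sqrt_m @ sqrt_m, cov, atol=1e-6)
```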
+ """ + try: + # Use scipy's sqrtm for general case + sqrt_m = linalg.sqrtm(matrix) + # Return real part if imaginary component is negligible + if np.iscomplexobj(sqrt_m): + sqrt_m = np.real(sqrt_m) + return sqrt_m + except np.linalg.LinAlgError: + # Fallback: eigenvalue decomposition + eigvals, eigvecs = np.linalg.eigh(matrix) + eigvals = np.maximum(eigvals, 0) # Ensure non-negative + sqrt_m = eigvecs @ np.diag(np.sqrt(eigvals)) @ eigvecs.T + return np.real(sqrt_m) + + def calculate_layoutfid( + self, + real_dataset: "MsgpackDatasetReader", + synth_dataset: "MsgpackDatasetReader", + batch_size: int = 32, + limit_sizes_to_smallest: bool = True, + use_image_only: bool = False, + ) -> tuple[float, int]: + """ + Calculate LayoutFID between real and generated document images. + + Args: + real_image_paths: List of paths to real document images (seed documents) + generated_image_paths: List of paths to generated document images + batch_size: Batch size for embedding extraction + + Returns: + LayoutFID score (lower is better) + """ + # limit both datasets to smallest size + if limit_sizes_to_smallest: + real_size = len(real_dataset) # type: ignore + synth_size = len(synth_dataset) # type: ignore + + # layout fix see which dataset is smaller in size + if real_size > synth_size: + logger.info( + f"Real dataset is bigger ({real_size} samples) than synthetic dataset ({synth_size} samples)." + ) + random_indices = torch.randperm(real_size)[:synth_size] + real_dataset.set_subset_indices(random_indices.tolist()) + else: + logger.info( + f"Synthetic dataset is bigger ({synth_size} samples) than real dataset ({real_size} samples)." + ) + random_indices = torch.randperm(synth_size)[:real_size] + synth_dataset.set_subset_indices(random_indices.tolist()) + + total_real_dataset_samples = len(real_dataset) # type: ignore + total_synth_dataset_samples = len(synth_dataset) # type: ignore + assert total_real_dataset_samples == total_synth_dataset_samples, ( + "FID calculation requires both datasets to have the same number of samples. " + f"Got {total_real_dataset_samples} real and {total_synth_dataset_samples} synthetic samples." + ) + + logger.info( + f"Extracting embeddings for {total_real_dataset_samples} real images..." + ) + real_embeddings = self._get_embeddings( + real_dataset, batch_size, use_image_only=use_image_only + ) + + logger.info( + f"Extracting embeddings for {total_synth_dataset_samples} generated images..." + ) + gen_embeddings = self._get_embeddings(synth_dataset, batch_size) + mu_real, sigma_real = self._compute_statistics(real_embeddings) + mu_gen, sigma_gen = self._compute_statistics(gen_embeddings) + layoutfid = self.calculate_frechet_distance( + mu_real, sigma_real, mu_gen, sigma_gen + ) + return layoutfid, real_embeddings.shape[0] + + +class LayoutFIDCalculatorConfig(pydantic.BaseModel): + """ + Configuration for clustering operations. 
+ """ + + seed: int = 42 + real_dataset_name: str + synth_dataset_name: str + batch_size: int = 32 + limit_sizes_to_smallest: bool = True + use_image_only: bool = False + + +def main( + cfg: LayoutFIDCalculatorConfig, +): + """Example usage of LayoutFID calculator.""" + + # load the results csv + output_df_path = Path("data/results/layout_fid.csv") + + # load the results csv and check if row with same real and synth dataset exists + if output_df_path.exists(): + output_df = pd.read_csv(output_df_path) + existing_row = output_df[ + (output_df["real_dataset"] == cfg.real_dataset_name) + & (output_df["synth_dataset"] == cfg.synth_dataset_name) + ] + if not existing_row.empty: + logger.info( + f"LayoutFID already calculated for real dataset '{cfg.real_dataset_name}' and synthetic dataset '{cfg.synth_dataset_name}'. Skipping calculation." + ) + logger.info( + f"Existing LayoutFID Score: {existing_row['layoutfid_score'].values[0]:.4f}" + ) + return + else: + output_df = pd.DataFrame( + columns=["real_dataset", "synth_dataset", "layoutfid_score", "num_samples"] + ) + + # torch manual seed for reproducibility + torch.manual_seed(42) + + # logging config + logger.info("Calculating LayoutFID with config:") + logger.info(cfg.json(indent=4)) + + # load real dataset pipeline + real_dataset = load_dataset( + dataset_name=cfg.real_dataset_name, + create_train_val_splits=False, + ).train + + synth_dataset = load_synthetic_dataset( + dataset_name=cfg.synth_dataset_name, + ).train + + # assert datasets are not None + assert real_dataset is not None, "Real dataset train split is None" + assert synth_dataset is not None, "Synthetic dataset train split is None" + + # log dataset sizes + logger.info(f"Real dataset size: {len(real_dataset)}") + logger.info(f"Synthetic dataset size: {len(synth_dataset)}") + + # Initialize calculator + calculator = LayoutFIDCalculator(device="cuda") + + # Calculate LayoutFID + layoutfid_score, num_samples = calculator.calculate_layoutfid( + real_dataset, + synth_dataset, + batch_size=cfg.batch_size, + limit_sizes_to_smallest=cfg.limit_sizes_to_smallest, + use_image_only=cfg.use_image_only, + ) + logger.info(f"\nLayoutFID Score: {layoutfid_score:.4f} over {num_samples} samples") + + # append result to csv + new_row = { + "real_dataset": cfg.real_dataset_name, + "synth_dataset": cfg.synth_dataset_name, + "layoutfid_score": layoutfid_score, + "num_samples": len(real_dataset), + } + output_df = pd.concat([output_df, pd.DataFrame([new_row])], ignore_index=True) + output_df.to_csv("data/results/layout_fid.csv", index=False) + logger.info("LayoutFID score saved to data/results/layout_fid.csv") + + +if __name__ == "__main__": + parser = pydantic_argparse.ArgumentParser( + model=LayoutFIDCalculatorConfig, + ) + main(parser.parse_typed_args()) diff --git a/docgenie/analyzation/synth/analyze_policy_violations.py b/docgenie/analyzation/synth/analyze_policy_violations.py new file mode 100755 index 0000000000000000000000000000000000000000..30c9befef45345dfbaacca24570d33bff2b0227f --- /dev/null +++ b/docgenie/analyzation/synth/analyze_policy_violations.py @@ -0,0 +1,112 @@ +import argparse +from collections import Counter +import json +import matplotlib.pyplot as plt +from tqdm import tqdm + +from docgenie import ENV +from docgenie.generation.models._file import SyntheticDatasetFileStructure +from docgenie.generation.models._syndatadef import SynDatasetDefinition + + +def parse_args(): + parser = argparse.ArgumentParser( + description="DocGenie Synthetic Document Generator", + 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "SynDatasetDefinition", + type=str, + help="Filename without extension of the SynDatasetDefinition in data/syn_dataset_definitions", + ) + + args = parser.parse_args() + assert args.SynDatasetDefinition + print(args) + return args + + +def search_for_refusals(dsname): + deffile = ENV.SYN_DATA_DEFINITIONS_DIR / f"{dsname}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file(deffile) + dsfiles: SyntheticDatasetFileStructure = dsdef.get_file_structure() + + msg_ids_from_all_batches = set() + msg_id_to_batch_id = dict() + for prompt_batch_log_path in dsfiles.prompt_batches_directory.iterdir(): + prompt_batch_log = json.loads(prompt_batch_log_path.read_text(encoding="utf-8")) + for msg_id in prompt_batch_log["message_ids"]: + msg_ids_from_all_batches.add(msg_id) + msg_id_to_batch_id[msg_id] = prompt_batch_log["id"] + + missing_message_results = set() + refusals = set() + for msg_id in msg_ids_from_all_batches: + # Look for missing message results, as previously we didn't save them + msg_res_path = dsfiles.message_results_directory / f"{msg_id}.json" + if not msg_res_path.exists(): + missing_message_results.add(msg_id) + else: + msg_res = json.loads(msg_res_path.read_text(encoding="utf-8")) + if msg_res["error"] == "refusal": + refusals.add(msg_id) + + # Search seed images + all_refusals = missing_message_results | refusals + prompt_batch_log_lookup = dict() + problematic_seeds = list() + for msg_id in all_refusals: + batch_id = msg_id_to_batch_id[msg_id] + if batch_id not in prompt_batch_log_lookup: + prompt_batch_log_path = ( + dsfiles.prompt_batches_directory / f"{batch_id}.json" + ) + prompt_batch_log = json.loads( + prompt_batch_log_path.read_text(encoding="utf-8") + ) + prompt_batch_log_lookup[batch_id] = prompt_batch_log + + prompt_batch_log = prompt_batch_log_lookup[batch_id] + msg_seeds = prompt_batch_log["message_id_to_seed_docids"][msg_id] + + # Previously there was a bug, such that every message got ALL seeds of the batch saved as list of lists in message_id_to_seed_docids + # In newer versions, this is just a single list + is_buggy_lookup = all(isinstance(elem, list) for elem in msg_seeds) + if is_buggy_lookup: + # we need to retrive the correct sublist via index + i = prompt_batch_log["message_ids"].index(msg_id) + msg_seeds = msg_seeds[i] + else: + # msg_seeds is already in correct format + ... 
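To make the legacy lookup format concrete, here is a small illustrative example (hypothetical IDs) of the two `message_id_to_seed_docids` shapes handled above.

```python
# Hypothetical example of the buggy (list-of-lists) vs. fixed seed lookup formats.
message_ids = ["msg-a", "msg-b"]
legacy_lookup = {                                          # buggy: every message stores ALL batch seeds
    "msg-a": [["seed-1"], ["seed-2", "seed-3"]],
    "msg-b": [["seed-1"], ["seed-2", "seed-3"]],
}
fixed_lookup = {"msg-a": ["seed-1"], "msg-b": ["seed-2", "seed-3"]}   # newer: only this message's seeds

msg_seeds = legacy_lookup["msg-b"]
if all(isinstance(elem, list) for elem in msg_seeds):      # same check as is_buggy_lookup above
    msg_seeds = msg_seeds[message_ids.index("msg-b")]      # recovers ["seed-2", "seed-3"]
```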
+ problematic_seeds.extend(msg_seeds) + c = Counter(problematic_seeds) + sc = sorted(c.items(), key=lambda item: item[1], reverse=True) + for seed, cnt in sc[:3]: + print(f"{cnt=} {dsfiles.preprocessed_seed_images_directory / f'{seed}.jpg'}") + + return all_refusals + + +if __name__ == "__main__": + dsnames = [ + "cord_alpha=0.75", + "cord_alpha=1.0_v1", + "docvqa_alpha=0.5", + "docvqa_alpha=0.5_v1", + "docvqa_alpha=0.75", + "docvqa_alpha=0.75_v1", + "docvqa_alpha=1.0", + "docvqa_alpha=1.0_v1", + "publaynet_alpha=0.75", + "rvlcdip_alpha=0.5", + "rvlcdip_alpha=0.5_v1", + "rvlcdip_alpha=0.75", + "rvlcdip_alpha=0.75_v1", + "rvlcdip_alpha=1.0", + "rvlcdip_alpha=1.0_v1", + ] + for n in dsnames: + refusals = search_for_refusals(n) + print(f"{n} {len(refusals)=}") diff --git a/docgenie/analyzation/synth/select_hw_examples.py b/docgenie/analyzation/synth/select_hw_examples.py new file mode 100755 index 0000000000000000000000000000000000000000..620ab895fe9f1640b3dbd31910ecc29a765b196c --- /dev/null +++ b/docgenie/analyzation/synth/select_hw_examples.py @@ -0,0 +1,107 @@ +import argparse +from collections import Counter +import json +import matplotlib.pyplot as plt +from tqdm import tqdm + +from docgenie import ENV +from docgenie.generation.models._file import SyntheticDatasetFileStructure +from docgenie.generation.models._syndatadef import SynDatasetDefinition + + +def parse_args(): + parser = argparse.ArgumentParser( + description="DocGenie Synthetic Document Generator", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "SynDatasetDefinition", + type=str, + help="Filename without extension of the SynDatasetDefinition in data/syn_dataset_definitions", + ) + + args = parser.parse_args() + assert args.SynDatasetDefinition + print(args) + return args + + +def get_all_hw_sentence_imgs(dsname): + deffile = ENV.SYN_DATA_DEFINITIONS_DIR / f"{dsname}.yaml" + dsdef: SynDatasetDefinition = SynDatasetDefinition.from_file(deffile) + dsfiles: SyntheticDatasetFileStructure = dsdef.get_file_structure() + + images_path = dsfiles.handwritten_text_images_directory / "sentences" + if not images_path.exists(): + return + + for d in images_path.iterdir(): + if d.is_dir(): + for f in d.iterdir(): + yield f + + +if __name__ == "__main__": + datasets=[ + "cord_alpha=0.5", + "cord_alpha=0.5_v1", + "cord_alpha=0.75", + "cord_alpha=0.75_v1", + "cord_alpha=1.0", + "cord_alpha=1.0_v1", + "doclaynet4k_alpha=1.0_CLS", + "doclaynet4k_alpha=1.0_DLA", + "doclaynet_alpha=1.0_CLS", + "doclaynet_alpha=1.0_DLA", + "docvqa_alpha=0.5", + "docvqa_alpha=0.5_v1", + "docvqa_alpha=0.75", + "docvqa_alpha=0.75_v1", + "docvqa_alpha=1.0", + "docvqa_alpha=1.0_v1", + "funsd_alpha=1.0", + "icdar2019_alpha=1.0", + "kleister_alpha=1.0", + "publaynet_alpha=0.5", + "publaynet_alpha=0.5_v1", + "publaynet_alpha=0.75", + "publaynet_alpha=0.75_v1", + "publaynet_alpha=1.0", + "publaynet_alpha=1.0_v1", + "publaynet_correct-sampling_alpha=0.5", + "publaynet_correct-sampling_alpha=0.5_v1", + "publaynet_correct-sampling_alpha=0.75", + "publaynet_correct-sampling_alpha=0.75_v1", + "publaynet_correct-sampling_alpha=1.0", + "publaynet_correct-sampling_alpha=1.0_v1", + "rvlcdip_alpha=0.5", + "rvlcdip_alpha=0.5_v1", + "rvlcdip_alpha=0.75", + "rvlcdip_alpha=0.75_v1", + "rvlcdip_alpha=1.0", + "rvlcdip_alpha=1.0_v1", + "sroie_alpha=1.0", + "tobacco3482_alpha=1.0", + "wtq_alpha=1.0", + ] + + all_hw_sent_imgs = [] + for n in datasets: + hw_sent_imgs = get_all_hw_sentence_imgs(n) + all_hw_sent_imgs.extend(list(hw_sent_imgs)) + + 
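The sampling block that follows is meant to draw a reproducible subset of the pooled handwriting crops; note that `random.seed` must be called as a function for the shuffle to actually be seeded (assigning to it, as in `random.seed = 42`, replaces the function and leaves the shuffle unseeded). A minimal sketch of the intended behaviour:

```python
# Reproducible sampling sketch: seed the RNG (as a call), shuffle, take the first N.
import random

random.seed(42)                      # note: must be called, not assigned
random.shuffle(all_hw_sent_imgs)     # all_hw_sent_imgs is the list pooled above
sampled = all_hw_sent_imgs[:200]
```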
import random + random.seed = 42 + random.shuffle(all_hw_sent_imgs) + + import shutil + import pathlib + NUM_IMGS = 200 + + f: pathlib.Path + for f in all_hw_sent_imgs[:NUM_IMGS]: + d = ENV.DATA_DIR / "hw_imgs" / f'{f.parent.stem}-{f.name}' + print(d) + input() + shutil.copy(f, d) diff --git a/docgenie/analyzation/utils.py b/docgenie/analyzation/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..bab0d9a1e9b9da7b713c4d82e4c152aba90b8bf7 --- /dev/null +++ b/docgenie/analyzation/utils.py @@ -0,0 +1,16 @@ +import pathlib +import h5py +import numpy as np +from tqdm import tqdm + + +def read_h5_numpy(path: pathlib.Path) -> np.ndarray: + all_embeddings = [] + all_ids = [] + with h5py.File(path, "r") as f: + for id_ in tqdm(sorted(f.keys())): + emb = f[id_][:] # load tensor in numpy format + all_embeddings.append(emb) + all_ids.append(id_) + + return all_embeddings, all_ids \ No newline at end of file diff --git a/docgenie/data/README.md b/docgenie/data/README.md new file mode 100755 index 0000000000000000000000000000000000000000..3fe47f7b8fb3fdc8450897d82eca5bc302977160 --- /dev/null +++ b/docgenie/data/README.md @@ -0,0 +1,83 @@ +# DocGenie +## Setup environment +```bash +uv sync +source .venv/bin/activate +``` + +## Run visualizations scripts for datasets for sanity check +```bash +# classification +uv run python docgenie/data/cmds/visualize.py --dataset-name tobacco3482 +uv run python docgenie/data/cmds/visualize.py --dataset-name rvlcdip + +# entity labeling +uv run python docgenie/data/cmds/visualize.py --dataset-name cord +uv run python docgenie/data/cmds/visualize.py --dataset-name sroie +uv run python docgenie/data/cmds/visualize.py --dataset-name funsd +uv run python docgenie/data/cmds/visualize.py --dataset-name wild_receipts +uv run python docgenie/data/cmds/visualize.py --dataset-name docile + +# extractive qa +uv run python docgenie/data/cmds/visualize.py --dataset-name ex_docvqa # avg pages ~1 +uv run python docgenie/data/cmds/visualize.py --dataset-name ex_deepform # avg pages ~5 +uv run python docgenie/data/cmds/visualize.py --dataset-name ex_tabfact # avg pages ~1 +uv run python docgenie/data/cmds/visualize.py --dataset-name ex_wiki # avg pages ~1 +uv run python docgenie/data/cmds/visualize.py --dataset-name ex_infographics # avg pages ~1 +uv run python docgenie/data/cmds/visualize.py --dataset-name ex_klc # avg pages ~23 +``` + +## How to load a specific dataset without transforms +This script assumes that datasets are already prepared in the /path/to/datasets/ dir in msgpack format +The dataset preparation itself is managed using a separate atria_datasets library. +To keep docgenie code clean the two are separated. +```python +from docgenie.data import load_dataset +dataset = load_dataset(dataset_name, root_datasets_dir="/path/to/datasets/") + +# read samples or use dataset.train[0] +train_dataset = dataset.train # could be None, check for actual use +for sample in dataset.train: + print("Sample: ", sample) + +validation_dataset = dataset.validation # could be None, check for actual use +for sample in dataset.validation: + print("Sample: ", sample) + +test_dataset = dataset.test # could be None, check for actual use +for sample in dataset.test: + print("Sample: ", sample) +``` + +## How to load a specific dataset with task-specific transforms +This script assumes that datasets are already prepared in the /path/to/datasets/ dir in msgpack format +The dataset preparation itself is managed using a separate atria_datasets library. 
+To keep docgenie code clean the two are separated. +```python +from docgenie.data import load_data_pipeline + +# load sequence classification dataset pipeline +data_pipeline = load_data_pipeline( + dataset_name=dataset_name, +) + +# load tokenized batch from train dataloader +for batch in data_pipeline.train_dataloader: + print(batch) + +# load tokenized batch from validation dataloader +for batch in data_pipeline.validation_dataloader: + print(batch) + +# load tokenized batch from test dataloader +for batch in data_pipeline.test_dataloader: + print(batch) +``` + +## Run tests for data pipeline to make sure its correct +This can be run to test datasets. If tests fail this means something wrong with preparation +of that dataset. + +``` +uv run pytest ./tests/test_data_pipeline.py -q --tb=line +``` \ No newline at end of file diff --git a/docgenie/data/__init__.py b/docgenie/data/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..3665022392003e561972c94f52bc1eabc20705c6 --- /dev/null +++ b/docgenie/data/__init__.py @@ -0,0 +1,3 @@ +from ._transforms import * # noqa +from .interface import * # noqa +from .constants import * # noqa diff --git a/docgenie/data/_core/_data_pipeline.py b/docgenie/data/_core/_data_pipeline.py new file mode 100755 index 0000000000000000000000000000000000000000..26dbbd581d81fb3bc3775a228f2f2911ee8d5653 --- /dev/null +++ b/docgenie/data/_core/_data_pipeline.py @@ -0,0 +1,233 @@ +""" +A data pipeline that wraps around a dataset and provides dataloaders for training, validation, and testing. +""" + +from typing import TYPE_CHECKING, Callable + +from atria_core.utilities.repr import RepresentationMixin + +from docgenie.data._core._data_types import MMDetInput +from docgenie.data._core._msgpack_dataset_reader import MsgpackDatasetReader +from docgenie.logging import get_logger + +from ._dataset import Dataset +from ._utilities import ( + auto_dataloader, + default_collate, +) + +if TYPE_CHECKING: + from torch.utils.data import DataLoader + + +logger = get_logger(__name__) + + +def mmdet_pseudo_collate(batch: list["MMDetInput"]): + """ + Default collate function for MMDetInput inputs. + + This function collates a batch of data instances into a single batch. It is used when + the `collate_fn` argument is not provided to the DataLoader. + + Args: + batch (List[MMDetInput]): A batch of data instances. + + Returns: + Any: The collated batch. + + Raises: + ValueError: If the batch is empty or not a list. + """ + from mmengine.dataset.utils import pseudo_collate + + return MMDetInput( + **pseudo_collate( + [ + { + "inputs": sample.inputs, + "data_samples": sample.data_samples, + } + for sample in batch + ] + ) + ) + + +class DataPipeline(RepresentationMixin): + def __init__( + self, + dataset: "Dataset", + # dataset split args + dataset_splitting_enabled: bool = False, + split_ratio: float = 0.9, + # collate_fn + collate_fn: str | None = "default_collate", + ): + self._dataset = dataset + self._sharded_storage_kwargs = {} + self._dataset_splitter = None + + # if dataset_splitting_enabled and self._dataset.validation is None: # just make sure to turn this off for now + # assert self._dataset.train is not None, ( + # "Dataset splitting enabled but no training dataset found." 
+ # ) + # self._dataset_splitter = StandardSplitter( + # split_ratio=split_ratio, shuffle=True + # ) + # self._dataset.train, self._dataset.validation = self._dataset_splitter( + # self._dataset.train + # ) + + # logger.info("Dataset splitting enabled.") + # logger.info( + # f"Train set size: {self._dataset.train_size}, Validation set size: {self._dataset.validation_size}" + # ) + + if collate_fn == "default_collate": + self._collate_fn = default_collate + elif collate_fn == "mmdet_pseudo_collate": + self._collate_fn = mmdet_pseudo_collate + elif collate_fn == "identity": + self._collate_fn = lambda x: x + else: + raise ValueError(f"Invalid collate_fn: {collate_fn}") + + @property + def dataset(self): + return self._dataset + + @property + def dataset_metadata(self): + return self._dataset.metadata + + def set_transform( + self, transform: Callable, for_train: bool = True, for_eval: bool = True + ): + from torch.utils.data import ConcatDataset + + if for_train and self._dataset.train is not None: + if isinstance(self._dataset.train, ConcatDataset): + for ds in self._dataset.train.datasets: + ds.set_transform(transform) + else: + self._dataset.train.set_transform(transform) + + if for_eval and self._dataset.validation is not None: + self._dataset.validation.set_transform(transform) + + if for_eval and self._dataset.test is not None: + self._dataset.test.set_transform(transform) + + def dataloader( + self, + split: str, + batch_size: int = 1, + pin_memory: bool = True, + num_workers: int = 4, + shuffle: bool = True, + ): + if split == "train": + return self.train_dataloader( + batch_size=batch_size, + pin_memory=pin_memory, + num_workers=num_workers, + shuffle=shuffle, + ) + elif split == "validation": + return self.validation_dataloader( + batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers + ) + elif split == "test": + return self.test_dataloader( + batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers + ) + else: + raise ValueError(f"Invalid split name: {split}") + + def train_dataloader( + self, + batch_size: int = 1, + pin_memory: bool = True, + num_workers: int = 4, + shuffle: bool = True, + ) -> "DataLoader | None": + import ignite.distributed as idist + from torch.utils.data import RandomSampler, SequentialSampler + + if self._dataset.train is None: + return + + return auto_dataloader( + dataset=self._dataset.train, + collate_fn=self._collate_fn, + sampler=RandomSampler(self._dataset.train) + if shuffle + else SequentialSampler(self._dataset.train), + drop_last=idist.get_world_size() > 1, + batch_size=batch_size * idist.get_world_size(), + num_workers=num_workers, + pin_memory=pin_memory, + ) + + def validation_dataloader( + self, batch_size: int = 1, pin_memory: bool = True, num_workers: int = 4 + ) -> "DataLoader | None": + dataset = self._dataset.validation or self._dataset.test + if dataset is None: + return + + if self._dataset.validation is None: + logger.warning( + "No validation dataset found, using test dataset for validation." 
+ ) + + return self._build_evaluation_dataloader( + dataset, + batch_size=batch_size, + pin_memory=pin_memory, + num_workers=num_workers, + ) + + def test_dataloader( + self, batch_size: int = 1, pin_memory: bool = True, num_workers: int = 4 + ) -> "DataLoader | None": + if self._dataset.test is None: + return None + return self._build_evaluation_dataloader( + self._dataset.test, + batch_size=batch_size, + pin_memory=pin_memory, + num_workers=num_workers, + ) + + def _build_evaluation_dataloader( + self, + dataset: "MsgpackDatasetReader", + batch_size: int = 1, + pin_memory: bool = True, + num_workers: int = 4, + ) -> "DataLoader": + if dataset is None: + return None + + import ignite.distributed as idist # type: ignore + from torch.utils.data import SequentialSampler # type: ignore + + if idist.get_world_size() > 1: + if len(dataset) % idist.get_world_size() != 0: + logger.warning( + "Enabling distributed evaluation with an eval dataset not divisible by process number. " + "This will slightly alter validation results as extra duplicate entries are added to achieve " + "equal num of samples per-process." + ) + return auto_dataloader( + dataset=dataset, + collate_fn=self._collate_fn, + shuffle=False, + drop_last=False, + sampler=SequentialSampler(dataset), + batch_size=batch_size * idist.get_world_size(), + pin_memory=pin_memory, + num_workers=num_workers, + ) diff --git a/docgenie/data/_core/_data_types.py b/docgenie/data/_core/_data_types.py new file mode 100755 index 0000000000000000000000000000000000000000..df0f8f2cf0e583d9214a44139de120c0b8c8412c --- /dev/null +++ b/docgenie/data/_core/_data_types.py @@ -0,0 +1,537 @@ +from __future__ import annotations + +import enum +from dataclasses import dataclass, field, fields, replace +from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar + +from atria_core.types import * +from mmdet.structures import DetDataSample +from pydantic import ConfigDict + +if TYPE_CHECKING: + from typing import Any + + import torch + + +class OverflowStrategy(str, enum.Enum): + select_first = "select_first" + select_all = "select_all" + select_random = "select_random" + + +if TYPE_CHECKING: + import torch + +T = TypeVar("T", bound="BaseModelInput") + + +@dataclass(frozen=True) +class MMDetInput: + model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid") + inputs: list[Any] | Any + data_samples: DetDataSample + + def to(self, device: torch.device | str) -> "MMDetInput": + inputs = [tensor.to(device) for tensor in self.inputs] + return MMDetInput( + inputs=inputs, + data_samples=self.data_samples, + ) + + +@dataclass(frozen=True) +class BaseModelInput: + """ + Base class for model input dataclasses. + - Frozen (immutable) + - Prevents nested BaseModelInput instances + - Provides transform utilities (like .to(device)) + """ + + _is_batched: bool = field(default=False, repr=False, compare=False) + + def __post_init__(self): + # Disallow nested BaseModelInput instances + for f in fields(self): + value = getattr(self, f.name) + if isinstance(value, BaseModelInput): + raise TypeError( + f"Field '{f.name}' cannot be another BaseModelInput " + f"({type(value).__name__}). Nesting is not allowed." + ) + + def _map_tensors(self, fn: callable): + """ + Internal helper: apply a function to all torch.Tensor fields. + Returns a new instance with transformed fields. 
+ """ + import torch + + updates = {} + for f in fields(self): + val = getattr(self, f.name) + if isinstance(val, torch.Tensor): + updates[f.name] = fn(val) + elif isinstance(val, list): + # If it's a list of tensors, map them too + updates[f.name] = [ + fn(v) if isinstance(v, torch.Tensor) else v for v in val + ] + else: + updates[f.name] = val + return replace(self, **updates) + + def to(self, device: torch.device | str): + """Move all tensor fields to a given device.""" + return self._map_tensors(lambda t: t.to(device)) + + def cpu(self): + """Move all tensor fields to CPU.""" + return self._map_tensors(lambda t: t.cpu()) + + def cuda(self): + """Move all tensor fields to CUDA.""" + return self._map_tensors(lambda t: t.cuda()) + + def numpy(self): + """Convert all tensor fields to numpy arrays.""" + return self._map_tensors( + lambda t: t.detach().cpu().numpy() if isinstance(t, torch.Tensor) else t + ) + + @classmethod + def batch(cls: Type[T], instances: list[T]) -> T: + """ + Batch a list of BaseModelInput instances into a single instance. + - Tensor fields are stacked along dim=0. + - Non-tensor fields become lists. + """ + import torch + + if not instances: + raise ValueError("Cannot batch an empty list of inputs.") + if not all(isinstance(x, cls) for x in instances): + raise TypeError(f"All elements must be instances of {cls.__name__}.") + + field_values = {} + for f in fields(instances[0]): + if f.name.startswith("_"): + field_values[f.name] = getattr(instances[0], f.name) + continue + + vals = [getattr(x, f.name) for x in instances] + if vals[0] is None: # we assume if any value is None, all are None + field_values[f.name] = None + continue + + if all(isinstance(v, torch.Tensor) for v in vals): + field_values[f.name] = torch.stack(vals, dim=0) + else: + field_values[f.name] = vals + + return cls(**field_values) + + def __repr__(self) -> str: + """ + Generates a developer-friendly string representation of the object. + + Returns: + str: A developer-friendly string representation of the object. + """ + + import torch + from rich.pretty import pretty_repr + + torch.set_printoptions(edgeitems=2, threshold=100) + + return pretty_repr(self, max_length=4, max_string=128, max_depth=3) + + def __str__(self) -> str: + """ + Generates a human-readable string representation of the object. + + Returns: + str: A human-readable string representation of the object. 
+ """ + + import torch + from rich.pretty import pretty_repr + + torch.set_printoptions(edgeitems=2, threshold=100) + + return pretty_repr(self, max_length=4, max_string=128, max_depth=3) + + +@dataclass(frozen=True) +class DocumentInstanceModelInput(BaseModelInput): + tokenizer_config: dict | None = None + + # token level fields + token_ids: "torch.Tensor" = None + token_bboxes: Optional["torch.Tensor"] = None + token_type_ids: Optional["torch.Tensor"] = None + token_labels: Optional["torch.Tensor"] = None + attention_mask: "torch.Tensor" = None + word_ids: "torch.Tensor" = None + sequence_ids: "torch.Tensor" = None + overflow_to_sample_mapping: "torch.Tensor" = None + + # segment level fields + segment_index: "torch.Tensor" = None + segment_inner_token_rank: "torch.Tensor" = None + first_token_idxes: "torch.Tensor" = None + first_token_idxes_mask: "torch.Tensor" = None + + # sample level fields + index: Optional["torch.Tensor"] = ( + None # index is used to uniquely identify a sample in a batch + ) + sample_id: str = None + image: Optional["torch.Tensor"] = None + label: Optional["torch.Tensor"] = None + words: list[str] = None + + # extractive QA specific fields + question_id: int | None = None + qa_question: str | None = None + qa_answers: list[str] | None = None + token_answer_start: Optional["torch.Tensor"] = None + token_answer_end: Optional["torch.Tensor"] = None + + def select_overflow_samples_by_id(self, is_random: bool = False): + import torch + + assert self._is_batched, ( + "select_all_overflow_samples can only be called on batched inputs." + ) + + def _gather_idx_from_sequence_list( + samples_batch: list[torch.Tensor], + ) -> torch.Tensor | None: + if samples_batch is None: + return None + + resolved_samples_batch = [] + for sample_data in samples_batch: + if len(sample_data) == 1: + resolved_samples_batch.append(sample_data[0]) + else: + idx = ( + 0 + if not is_random + else torch.randint(0, sample_data.shape[0], (1,)).item() + ) + resolved_samples_batch.append(sample_data[idx]) + return torch.stack(resolved_samples_batch) + + token_ids = _gather_idx_from_sequence_list(self.token_ids) + token_type_ids = _gather_idx_from_sequence_list(self.token_type_ids) + token_bboxes = _gather_idx_from_sequence_list(self.token_bboxes) + token_labels = _gather_idx_from_sequence_list(self.token_labels) + attention_mask = _gather_idx_from_sequence_list(self.attention_mask) + word_ids = _gather_idx_from_sequence_list(self.word_ids) + sequence_ids = _gather_idx_from_sequence_list(self.sequence_ids) + overflow_to_sample_mapping = _gather_idx_from_sequence_list( + self.overflow_to_sample_mapping + ) + + # segment level fields + segment_index = _gather_idx_from_sequence_list(self.segment_index) + segment_inner_token_rank = _gather_idx_from_sequence_list( + self.segment_inner_token_rank + ) + first_token_idxes = _gather_idx_from_sequence_list(self.first_token_idxes) + first_token_idxes_mask = _gather_idx_from_sequence_list( + self.first_token_idxes_mask + ) + + # sample level fields remain unchanged + token_answer_start, token_answer_end = None, None + if self.token_answer_start is not None: + token_answer_start = _gather_idx_from_sequence_list(self.token_answer_start) + token_answer_end = _gather_idx_from_sequence_list(self.token_answer_end) + + return replace( + self, + token_ids=token_ids, + token_type_ids=token_type_ids, + token_bboxes=token_bboxes, + token_labels=token_labels, + attention_mask=attention_mask, + word_ids=word_ids, + sequence_ids=sequence_ids, + 
overflow_to_sample_mapping=overflow_to_sample_mapping, + token_answer_start=token_answer_start, + token_answer_end=token_answer_end, + # segment level fields + segment_index=segment_index, + segment_inner_token_rank=segment_inner_token_rank, + first_token_idxes=first_token_idxes, + first_token_idxes_mask=first_token_idxes_mask, + # stack tensors + image=self.image if self.image is None else torch.stack(self.image), + label=self.label if self.label is None else torch.stack(self.label), + # index=self.index if self.index is None else torch.tensor(self.index), + ) + + def resolve_sample_overflow( + self, overflow_strategy: OverflowStrategy = OverflowStrategy.select_all + ) -> DocumentInstanceModelInput: + if not isinstance(self.token_ids, list): + # already resolved + return self + + if overflow_strategy == OverflowStrategy.select_all: + return self.select_all_overflow_samples() + elif overflow_strategy == OverflowStrategy.select_first: + return self.select_first_overflow_samples() + elif overflow_strategy == OverflowStrategy.select_random: + return self.select_random_overflow_samples() + else: + raise ValueError(f"Unknown overflow strategy: {overflow_strategy}") + + def select_first_overflow_samples(self): + return self.select_overflow_samples_by_id(is_random=False) + + def select_random_overflow_samples(self): + return self.select_overflow_samples_by_id(is_random=True) + + def select_all_overflow_samples(self) -> tuple[bool, list[int], list[str]]: + import torch + + assert self._is_batched, ( + "select_all_overflow_samples can only be called on batched inputs." + ) + repeat_indices = [sample.shape[0] for sample in self.token_ids] + + # we concatenate all lists of overflowed samples into a single tensor + def _cat_tensor_fields(samples_list: list[torch.Tensor]) -> torch.Tensor | None: + if samples_list is not None: + return torch.cat(samples_list, dim=0) + return None + + # these are all fields that are already in overflowed format + token_ids = _cat_tensor_fields(self.token_ids) + token_bboxes = _cat_tensor_fields(self.token_bboxes) + token_type_ids = _cat_tensor_fields(self.token_type_ids) + token_labels = _cat_tensor_fields(self.token_labels) + attention_mask = _cat_tensor_fields(self.attention_mask) + word_ids = _cat_tensor_fields(self.word_ids) + sequence_ids = _cat_tensor_fields(self.sequence_ids) + overflow_to_sample_mapping = _cat_tensor_fields(self.overflow_to_sample_mapping) + + # segment level fields + segment_index = _cat_tensor_fields(self.segment_index) + segment_inner_token_rank = _cat_tensor_fields(self.segment_inner_token_rank) + first_token_idxes = _cat_tensor_fields(self.first_token_idxes) + first_token_idxes_mask = _cat_tensor_fields(self.first_token_idxes_mask) + + token_answer_start, token_answer_end = None, None + if self.token_answer_start is not None and self.token_answer_end is not None: + token_answer_start = _cat_tensor_fields(self.token_answer_start) + token_answer_end = _cat_tensor_fields(self.token_answer_end) + + # these are fields that are at sample level and need to be repeated in case of overflow + # sample level fields + index = self._repeat_field(self.index, repeat_indices) + sample_id = self._repeat_field(self.sample_id, repeat_indices) + image = self._repeat_field(self.image, repeat_indices) + label = self._repeat_field(self.label, repeat_indices) + words = self._repeat_field(self.words, repeat_indices) + + # extractive QA specific fields + question_id = self._repeat_field(self.question_id, repeat_indices) + qa_question = 
self._repeat_field(self.qa_question, repeat_indices) + qa_answers = self._repeat_field(self.qa_answers, repeat_indices) + + repeated_instance = replace( + self, + token_ids=token_ids, + token_bboxes=token_bboxes, + token_type_ids=token_type_ids, + token_labels=token_labels, + attention_mask=attention_mask, + word_ids=word_ids, + sequence_ids=sequence_ids, + overflow_to_sample_mapping=overflow_to_sample_mapping, + # segment level fields + segment_index=segment_index, + segment_inner_token_rank=segment_inner_token_rank, + first_token_idxes=first_token_idxes, + first_token_idxes_mask=first_token_idxes_mask, + # sample level fields + index=index, + sample_id=sample_id, + image=image, + label=label, + words=words, + question_id=question_id, + qa_question=qa_question, + qa_answers=qa_answers, + token_answer_start=token_answer_start, + token_answer_end=token_answer_end, + ) + + for key, value in repeated_instance.to_dict().items(): + if isinstance(value, list) and len(value) != sum(repeat_indices): + raise ValueError( + f"Field '{key}' length {len(value)} does not match expected {sum(repeat_indices)}" + ) + if isinstance(value, torch.Tensor) and value.size(0) != sum(repeat_indices): + raise ValueError( + f"Field '{key}' size {value.size(0)} does not match expected {sum(repeat_indices)}" + ) + return repeated_instance + + def _repeat_field(self, field_value: Any, repeat_indices: list[int]) -> Any: + import torch + + if isinstance(field_value, list): + if len(field_value) == 0: + return field_value + if len(field_value) != len(repeat_indices): + raise ValueError( + f"List length ({len(field_value)}) doesn't match repeat_indices length ({len(repeat_indices)})" + ) + repeated_list = [ + item + for item, count in zip(field_value, repeat_indices, strict=True) + for _ in range(count) + ] + + if isinstance(field_value[0], torch.Tensor): + return torch.stack(repeated_list, dim=0) + return repeated_list + + elif isinstance(field_value, torch.Tensor): + if field_value.size(0) != len(repeat_indices): + raise ValueError( + f"Tensor batch size ({field_value.size(0)}) doesn't match repeat_indices length ({len(repeat_indices)})" + ) + return field_value.repeat_interleave( + torch.tensor(repeat_indices, device=field_value.device), dim=0 + ) + + return field_value + + @classmethod + def batch(cls: DocumentInstanceModelInput, instances: list[T]) -> T: + if not instances: + raise ValueError("Cannot batch an empty list of inputs.") + if not all(isinstance(x, cls) for x in instances): + raise TypeError(f"All elements must be instances of {cls.__name__}.") + + field_values = {} + for f in fields(instances[0]): + if f.name == "_is_batched": + field_values[f.name] = True + continue + if f.name == "tokenizer_config": + # For tokenizer_config, we take from the first instance + field_values[f.name] = getattr(instances[0], f.name) + continue + if f.name.startswith("_"): + field_values[f.name] = getattr(instances[0], f.name) + continue + + vals = [getattr(x, f.name) for x in instances] + if vals[0] is None: # we assume if any value is None, all are None + field_values[f.name] = None + continue + + # we simply put all fields in a list and batch them later + # for example we can have sequences like following due to overflow mapping + # seq 1 -> token ids of size (2, 512) + # seq 2 -> token ids of size (1, 512) + # seq 3 -> token ids of size (4, 512) + field_values[f.name] = vals + + return cls(**field_values) + + def print_info(self): + import torch + + print("DocumentInstanceModelInput:") + for f in fields(self): + val = 
getattr(self, f.name) + if isinstance(val, torch.Tensor): + print(f" {f.name}: Tensor shape {val.shape}, dtype {val.dtype}") + elif isinstance(val, list): + if len(val) > 0 and isinstance(val[0], torch.Tensor): + shapes = [v.shape for v in val] + print(f" {f.name}: List of Tensors with shapes {shapes}") + else: + print(f" {f.name}: List of length {len(val)}") + else: + print(f" {f.name}: {type(val).__name__} value: {val}") + + def to_dict(self): + import torch + + result = {} + for f in fields(self): + val = getattr(self, f.name) + if isinstance(val, torch.Tensor): + result[f.name] = val.detach().cpu().numpy() + elif isinstance(val, list): + if len(val) > 0 and isinstance(val[0], torch.Tensor): + result[f.name] = [v.detach().cpu().numpy() for v in val] + else: + result[f.name] = val + else: + result[f.name] = val + + return result + + @classmethod + def from_dict(cls: Type[T], data: dict[str, Any]) -> T: + import numpy as np + import torch + + for key, value in data.items(): + if isinstance(value, np.ndarray): + data[key] = torch.tensor(value) + elif ( + isinstance(value, list) + and len(value) > 0 + and isinstance(value[0], np.ndarray) + ): + data[key] = [torch.tensor(v) for v in value] + else: + data[key] = value + + return cls(**data) + + +@dataclass(frozen=True) +class ConditionalGenerationModelInput(BaseModelInput): + index: Optional["torch.Tensor"] = None + sample_id: Optional[str] = None + input_ids: Optional["torch.Tensor"] = None + bbox: Optional["torch.Tensor"] = None + attention_mask: Optional["torch.Tensor"] = None + pixel_values: Optional["torch.Tensor"] = None + question_text: Optional[str] = None + target_text: Optional[str] = None + target_token_ids: Optional["torch.Tensor"] = None + words: Optional[list[str]] = None + word_labels: Optional[list[str]] = None + label: Optional["torch.Tensor"] = None + + +@dataclass(frozen=True) +class VLMModelInput(BaseModelInput): + index: Optional["torch.Tensor"] = None + sample_id: Optional[str] = None + input_ids: Optional["torch.Tensor"] = None + bbox: Optional["torch.Tensor"] = None + attention_mask: Optional["torch.Tensor"] = None + pixel_values: Optional["torch.Tensor"] = None + image_grid_thw: Optional["torch.Tensor"] = None + question_text: Optional[str] = None + target_text: Optional[str] = None + target_token_ids: Optional["torch.Tensor"] = None + words: Optional[list[str]] = None + word_labels: Optional[list[str]] = None + label: Optional["torch.Tensor"] = None diff --git a/docgenie/data/_core/_dataset.py b/docgenie/data/_core/_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..6b45751666c5071adefd52350fb41fa4b1c3df05 --- /dev/null +++ b/docgenie/data/_core/_dataset.py @@ -0,0 +1,114 @@ +""" +A simple dataset class that holds multiple split iterators. 
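+
+A hedged usage sketch (the config object and split access below are illustrative; datasets
+are normally built via ``DatasetFactory.load_dataset`` rather than constructed directly):
+
+    dataset = DatasetFactory.load_dataset(dataset_load_config)
+    print(dataset.train_size, dataset.validation_size, dataset.test_size)
+    for sample in dataset.train:
+        ...  # iterate over loaded data instances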
+""" + +from __future__ import annotations + +from typing import TypeVar + +from atria_core.types import DatasetMetadata +from atria_core.types.common import DatasetSplitType +from atria_core.types.data_instance.base import ( + BaseDataInstance, +) +from atria_core.utilities.repr import RepresentationMixin + +from docgenie.logging import get_logger + +from ._msgpack_dataset_reader import MsgpackDatasetReader +from ._utilities import TaskType + +logger = get_logger(__name__) + + +T_BaseDataInstance = TypeVar("T_BaseDataInstance", bound=BaseDataInstance) + + +class Dataset(RepresentationMixin): + def __init__( + self, + name: str, + split_iterators: dict, + metadata: DatasetMetadata, + task_type: TaskType, + ) -> None: + self._name = name + self._split_iterators: dict[DatasetSplitType, MsgpackDatasetReader] = ( + split_iterators + ) + self._metadata = metadata + self._task_type = task_type + + @property + def name(self) -> str: + """Dataset name.""" + return self._name + + @property + def task_type(self) -> TaskType: + """Dataset task type.""" + return self._task_type + + @task_type.setter + def task_type(self, value: TaskType) -> None: + self._task_type = value + + @property + def split_iterators( + self, + ) -> dict[DatasetSplitType, MsgpackDatasetReader]: + """Dictionary of split iterators.""" + return self._split_iterators + + @property + def train(self) -> MsgpackDatasetReader | None: + """Training split iterator. Returns None if training split is not available.""" + return self._split_iterators.get(DatasetSplitType.train, None) + + @property + def validation(self) -> MsgpackDatasetReader | None: + """Validation split iterator. Returns None if validation split is not available.""" + return self._split_iterators.get(DatasetSplitType.validation, None) + + @property + def test(self) -> MsgpackDatasetReader | None: + """Test split iterator. Returns None if test split is not available.""" + return self._split_iterators.get(DatasetSplitType.test, None) + + @train.setter + def train(self, value: MsgpackDatasetReader) -> None: + self._split_iterators[DatasetSplitType.train] = value + + @validation.setter + def validation(self, value: MsgpackDatasetReader) -> None: + self._split_iterators[DatasetSplitType.validation] = value + + @test.setter + def test(self, value: MsgpackDatasetReader) -> None: + self._split_iterators[DatasetSplitType.test] = value + + @property + def metadata(self) -> DatasetMetadata: + """Dataset metadata.""" + return self._metadata + + @property + def train_size(self) -> int: + """Length of the training split. Returns 0 if training split is not available.""" + if self.train is None: + return 0 + return len(self.train) + + @property + def validation_size(self) -> int: + """Length of the validation split. Returns 0 if validation split is not available.""" + if self.validation is None: + return 0 + return len(self.validation) + + @property + def test_size(self) -> int: + """Length of the test split. 
Returns 0 if test split is not available.""" + if self.test is None: + return 0 + return len(self.test) diff --git a/docgenie/data/_core/_dataset_factory.py b/docgenie/data/_core/_dataset_factory.py new file mode 100755 index 0000000000000000000000000000000000000000..ea8837e933e19abfc3bc5bb1cd3316fd46e4e96e --- /dev/null +++ b/docgenie/data/_core/_dataset_factory.py @@ -0,0 +1,177 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Callable + +import yaml +from atria_core.types import DatasetMetadata +from atria_core.types.common import DatasetSplitType +from atria_core.types.data_instance.base import ( + BaseDataInstance, +) +from atria_core.types.data_instance.document_instance import ( + DocumentInstance, +) + +from docgenie.data.constants import DatasetLoadConfig +from docgenie.logging import get_logger + +from ._dataset import Dataset +from ._msgpack_dataset_reader import ( + MsgpackDatasetReader, +) + +logger = get_logger(__name__) + + +class DatasetFactory: + """ + Factory class for creating and loading datasets from msgpack shard files. + + The DatasetFactory provides a centralized way to load datasets stored in a specific + directory structure with msgpack format. It automatically discovers available datasets + and configurations, validates paths, and creates appropriate data iterators for each split. + + Expected Directory Structure: + root_datasets_dir/ + └── dataset_name/ + └── storage/ + └── dataset_config_name/ + └── msgpack/ + ├── train/ + │ ├── shard_001.msgpack + │ └── shard_002.msgpack + ├── validation/ + │ └── shard_001.msgpack + └── test/ + └── shard_001.msgpack + + Usage: + # Basic usage with default DocumentInstance data model + dataset = DatasetFactory.load_dataset( + root_datasets_dir="/path/to/datasets", + dataset_name="my_dataset", + dataset_config_name="default" + + # With custom data model and output transformation + dataset = DatasetFactory.load_dataset( + root_datasets_dir="/path/to/datasets", + dataset_name="my_dataset", + dataset_config_name="processed", + data_model=CustomDataInstance, + output_transform=lambda x: preprocess(x) + + # Access splits + for sample in dataset.train: + # Process training samples + pass + + The factory handles: + - Automatic discovery of dataset splits (train, validation, test, etc.) + - Loading msgpack shard files for each split + - Data model instantiation and transformation + - Error handling with helpful messages about available datasets/configs + """ + + @classmethod + def get_preprocess_transform(self, preprocess_image_size: int) -> Callable: + def preprocess_transform(sample: BaseDataInstance) -> dict: + resized_image = sample.image.resize( + width=preprocess_image_size, height=preprocess_image_size + ) + return sample.model_copy(update={"image": resized_image}) + + return preprocess_transform + + @classmethod + def prepare_paths( + cls, root_datasets_dir: str | Path, dataset_name: str, dataset_config_name: str + ): + # construct paths + data_dir = Path(root_datasets_dir) / dataset_name / "storage" + metadata_file = data_dir / "metadata.yaml" + msgpack_dir = data_dir / dataset_config_name / "msgpack" + + if not data_dir.exists(): + raise ValueError( + f"Data directory {data_dir} does not exist. " + f"Please check the dataset {dataset_name} is prepared with config name {dataset_config_name}. " + ) + + assert metadata_file.exists(), f"Metadata file {metadata_file} does not exist. " + assert msgpack_dir.exists(), f"Data directory {msgpack_dir} does not exist. 
" + return metadata_file, msgpack_dir + + @classmethod + def load_metadata( + cls, + metadata_file: str | Path, + ) -> DatasetMetadata: + # load metadata + with open(metadata_file, "r") as f: + metadata = yaml.safe_load(f) + return DatasetMetadata(**metadata) + + @classmethod + def get_available_splits(cls, msgpack_dir: Path): + available_splits = [DatasetSplitType(x) for x in os.listdir(msgpack_dir)] + assert len(available_splits) > 0, ( + f"No splits found in {msgpack_dir}. Found {available_splits}" + ) + return available_splits + + @classmethod + def load_split_from_disk( + cls, + msgpack_dir: Path, + split: DatasetSplitType, + data_model: type[BaseDataInstance], + ) -> MsgpackDatasetReader: + # load msgpack files for this split + split_files = list((msgpack_dir / split.value).glob("*.msgpack")) + return MsgpackDatasetReader(msgpack_files=split_files, data_model=data_model) + + @classmethod + def load_dataset( + cls, + dataset_load_config: DatasetLoadConfig, + data_model: type[BaseDataInstance] = DocumentInstance, + split: str | None = None, + ) -> Dataset: + # get dataset name and config name + dataset_name, dataset_config_name = ( + dataset_load_config.dataset_name, + dataset_load_config.dataset_config_name, + ) + + # handle tuple config names + if isinstance(dataset_config_name, tuple): + dataset_name, dataset_config_name = dataset_config_name + + # construct paths + metadata_file, msgpack_dir = cls.prepare_paths( + dataset_load_config.root_datasets_dir, dataset_name, dataset_config_name + ) + + # load metadata + metadata = cls.load_metadata(metadata_file) + + # load split files + available_splits = cls.get_available_splits(msgpack_dir) + + # load split iterators + split_iterators = {} + for current_split in available_splits: + if split is not None and current_split.value != split: + continue + split_iterators[current_split] = cls.load_split_from_disk( + msgpack_dir, current_split, data_model + ) + + return Dataset( + name=dataset_name, + split_iterators=split_iterators, + metadata=metadata, + task_type=dataset_load_config.task_type, + ) diff --git a/docgenie/data/_core/_msgpack_dataset_reader.py b/docgenie/data/_core/_msgpack_dataset_reader.py new file mode 100755 index 0000000000000000000000000000000000000000..ae9f332fa5a92090231b2f2047dff21b6827767c --- /dev/null +++ b/docgenie/data/_core/_msgpack_dataset_reader.py @@ -0,0 +1,174 @@ +""" +Msgpack shard list dataset module taken from atria_datasets +""" + +from collections.abc import Sequence +from pathlib import Path +from typing import Any, Callable, TypeVar + +import numpy as np +from atria_core.types import BaseDataInstance +from datadings.reader import MsgpackReader as MsgpackFileReader + +from docgenie.data._core._data_types import DocumentInstanceModelInput +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +T_BaseDataInstance = TypeVar("T_BaseDataInstance", bound=BaseDataInstance) + + +class MsgpackDatasetReader(Sequence[Any]): + """ + A dataset class for reading Msgpack-based shard files. + + This class provides functionality for loading and iterating over datasets stored + in Msgpack-based shard files. It supports efficient indexing and cumulative size + calculations for handling multiple shards. + + Attributes: + _shard_files (list[str]): A list of Msgpack file path for each shard. + _cumulative_sizes (list[int]): Cumulative sizes of the shards for efficient indexing. + _total_size (int): The total number of samples across all shards. 
+ """ + + def __init__( + self, + msgpack_files: list[str] | list[Path], + data_model: type, + transform: Callable | None = None, + ) -> None: + """ + Initializes the `MsgpackShardListDataset`. + + Args: + shard_files (List[DatasetShardInfo]): A list of shard metadata containing file URLs. + """ + logger.info(f"Loading dataset from files: {msgpack_files}") + self._msgpack_files = sorted(msgpack_files) + self._total_size: int = 0 + + cumulative_sizes: list[int] = [] + for f in self._msgpack_files: + with MsgpackFileReader(f) as reader: + self._total_size += len(reader) + cumulative_sizes.append(self._total_size) + self._cumulative_sizes = np.array(cumulative_sizes) + + self._data_model = data_model + self._transform = transform + self._subset_indices = None + self._msgpack_file_readers = [MsgpackFileReader(f) for f in self._msgpack_files] + self._data_dir = Path(self._msgpack_files[0]).parent + + @property + def data_dir(self) -> Path: + return self._data_dir + + def set_subset_indices(self, indices: list[int]) -> None: + """ + Sets the subset indices for the dataset. + + Args: + indices (List[int]): A list of indices to subset the dataset. + """ + self._subset_indices = indices + + def set_transform(self, transform: Callable) -> None: + """ + Sets the transform function for the dataset. + + Args: + transform (Callable): A function to transform each data instance. + """ + self._transform = transform + + def _transform_input(self, input: Any) -> BaseDataInstance: + if issubclass(self._data_model, BaseDataInstance): + if "total_num_pages" in input: + input.pop("total_num_pages") + data_instance: BaseDataInstance = self._data_model.model_validate(input) + + # assert that the transformed instance is of the expected data model type + assert isinstance(data_instance, self._data_model), ( + f"self._input_transform(sample) should return {self._data_model}, but got {type(data_instance)}" + ) + + # load the data instance from disk if not already loaded + data_instance.load() + + # yield the transformed data instance if output transform is enabled + if self._transform is not None: + data_instance = self._transform(data_instance) + return data_instance + elif issubclass(self._data_model, DocumentInstanceModelInput): + data_instance = self._data_model.from_dict(input) + if self._transform is not None: + data_instance = self._transform(data_instance) + return data_instance + else: + raise ValueError( + f"Unsupported data model type: {self._data_model}. Must be a subclass of BaseDataInstance or DocumentInstanceModelInput." + ) + + def get_by_id(self, sample_id: str) -> int: + for reader in self._msgpack_file_readers: + try: + sample_id = str(sample_id) + index = reader.find_index(sample_id.replace(".", "_")) + sample = reader[index] + sample.pop("key", None) + sample = self._transform_input(sample) + assert sample.sample_id == sample_id, ( # this should never happen + f"Sample ID mismatch: expected {sample_id} ({type(sample_id)}), got {sample.sample_id} ({type(sample.sample_id)})" + ) + return sample + except KeyError: + continue + raise ValueError(f"Sample ID {sample_id} not found in any shard.") + + def __getitem__(self, index: int) -> dict[str, Any]: # type: ignore[override] + """ + Retrieves a sample from the dataset by index. + + Args: + index (int): The index of the sample to retrieve. + + Returns: + Dict[str, Any]: The sample at the specified index. 
+ """ + if self._subset_indices is not None: + index = self._subset_indices[index] + + shard_index = np.searchsorted(self._cumulative_sizes, index, side="right") + if shard_index == 0: + inner_index = index + else: + inner_index = index - self._cumulative_sizes[shard_index - 1] + sample = self._msgpack_file_readers[shard_index][inner_index] + sample.pop("key", None) + return self._transform_input(sample) + + def __len__(self) -> int: + """ + Returns the total number of samples in the dataset. + + Returns: + int: The total number of samples. + """ + if self._subset_indices is not None: + return len(self._subset_indices) + return self._total_size + + def close(self) -> None: + """ + Closes all shard file readers to release resources. + """ + for reader in self._msgpack_file_readers: + reader._close() + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}, " + f"total_size={self._total_size}, num_shards={len(self._msgpack_files)})" + ) diff --git a/docgenie/data/_core/_msgpack_dataset_writer.py b/docgenie/data/_core/_msgpack_dataset_writer.py new file mode 100755 index 0000000000000000000000000000000000000000..016c04c5023458c69b127d02ea12c20c5e92b8ef --- /dev/null +++ b/docgenie/data/_core/_msgpack_dataset_writer.py @@ -0,0 +1,108 @@ +""" +Defines interface for docgenie components to load datasets using DatasetFactory and log relevant information. +""" + +from __future__ import annotations + +from pathlib import Path + +import tqdm +from torch.utils.data import Dataset + +from docgenie.data._core._msgpack_dataset_reader import MsgpackDatasetReader +from docgenie.logging import get_logger + +from ._data_types import BaseDataInstance, DocumentInstance + +logger = get_logger(__name__) + + +class MsgpackDatasetWriter: + def __init__( + self, + dataset_reader: MsgpackDatasetReader | Dataset, + output_file: Path, + data_model: type | type[BaseDataInstance] = DocumentInstance, + ): + self._dataset_reader = dataset_reader + self._output_file = output_file + self._data_model = data_model + + def _get_dataloader(self): + import torch + + # setup dataloader + dataloader = torch.utils.data.DataLoader( + self._dataset_reader, + batch_size=16, + shuffle=False, + num_workers=0, + collate_fn=lambda x: x, + drop_last=False, + ) + return dataloader + + def write(self, force_overwrite: bool = False) -> MsgpackDatasetReader: + if force_overwrite: + logger.warning( + f"Force overwrite is enabled. Existing file at {self._output_file} will be deleted if it exists." 
+ ) + self._output_file.unlink(missing_ok=True) + + if not self._output_file.exists(): + self._write() + return self.read() + + def read(self): + return MsgpackDatasetReader( + msgpack_files=[str(self._output_file)], + data_model=self._data_model, + ) + + def _write(self): + from datadings.writer import FileWriter + + try: + dataloader = self._get_dataloader() + total_sample = len(self._dataset_reader) + self._output_file.parent.mkdir(parents=True, exist_ok=True) + with FileWriter( + self._output_file, + overwrite=True, + ) as writer: + for batch in tqdm.tqdm( + dataloader, + desc=f"Preprocessing dataset to {self._output_file} with total samples {total_sample}", + ): + for sample_or_sample_list in batch: + sample_list = ( + [sample_or_sample_list] + if not isinstance(sample_or_sample_list, list) + else sample_or_sample_list + ) + for sample in sample_list: + try: + sample_dict = ( + sample.to_dict() + if hasattr(sample, "to_dict") + else sample.model_dump() + ) + writer.write( + { + "key": sample.sample_id, + **sample_dict, + } + ) + except ValueError as e: + logger.error( + f"[WriteError] Failed to write sample '{getattr(sample, 'sample_id', 'unknown')}': {e}" + ) + continue + except Exception as e: + logger.error(f"Error while writing preprocessed data: {e}") + self._output_file.unlink(missing_ok=True) + raise e + except KeyboardInterrupt as e: + logger.error("Preprocessing interrupted by user.") + self._output_file.unlink(missing_ok=True) + raise e diff --git a/docgenie/data/_core/_standard_splitter.py b/docgenie/data/_core/_standard_splitter.py new file mode 100755 index 0000000000000000000000000000000000000000..e9b2a41338ae63f58f9e02b7d27dc1754d68d291 --- /dev/null +++ b/docgenie/data/_core/_standard_splitter.py @@ -0,0 +1,140 @@ +""" +Dataset Splitter Module + +This module defines the `StandardSplitter` class, which provides utilities for splitting +datasets into training and validation subsets. It supports both sequential and random +splitting strategies, with configurable options for shuffle, and split ratio. + +Classes: + - StandardSplitter: A class for splitting datasets into training and validation subsets. + +Dependencies: + - copy: For deep copying datasets. + - typing: For type annotations. + - torch.utils.data: For dataset splitting utilities. + - atria_core.logger.logger: For logging utilities. + - atria_registry: For registering dataset splitters. + - atria_datasets.core.datasets.atria_dataset: For the base dataset class. + +Author: Your Name (your.email@example.com) +Date: 2025-04-07 +Version: 1.0.0 +License: MIT +""" + +from atria_core.utilities.repr import RepresentationMixin + +from ._msgpack_dataset_reader import MsgpackDatasetReader + + +class StandardSplitter(RepresentationMixin): + """ + A class for splitting datasets into training and validation subsets. + + This class provides methods for creating sequential and random splits of datasets. + It supports configurable options for shuffle, and split ratio. + + Attributes: + split_ratio (float): The ratio of the training split. Defaults to 0.8. + shuffle (bool): Whether to shuffle the dataset before splitting. Defaults to True. + """ + + def __init__(self, split_ratio: float = 0.8, shuffle: bool = True): + """ + Initializes the `StandardSplitter`. + + Args: + split_ratio (float): The ratio of the training split. Defaults to 0.8. + shuffle (bool): Whether to shuffle the dataset before splitting. Defaults to True. 
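+
+        Example (illustrative numbers): with ``split_ratio=0.8`` and a training split of
+        1000 samples, the first returned subset keeps 800 samples and the second keeps
+        the remaining 200 (chosen randomly when ``shuffle=True``, sequentially otherwise).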
+ """ + self.split_ratio = split_ratio + self.shuffle = shuffle + + def create_sequential_split( + self, train: "MsgpackDatasetReader" + ) -> tuple["MsgpackDatasetReader", "MsgpackDatasetReader"]: + """ + Creates a sequential split of the dataset. + + The dataset is split into training and validation subsets based on the split ratio, + without shuffling. + + Args: + train_dataset (AtriaDataset): The dataset to split. + + Returns: + Tuple[AtriaDataset, AtriaDataset]: The training and validation subsets. + """ + import copy + + dataset_size = len(train) + split_point = int(dataset_size * round(self.split_ratio, 2)) + + validation = copy.deepcopy(train) + train.set_subset_indices(list(range(split_point))) + validation.set_subset_indices(list(range(split_point))) + return train, validation + + def create_random_split( + self, train: "MsgpackDatasetReader" + ) -> tuple["MsgpackDatasetReader", "MsgpackDatasetReader"]: + """ + Creates a random split of the dataset. + + The dataset is split into training and validation subsets based on the split ratio, + with shuffling. + + Args: + train_dataset (AtriaDataset): The dataset to split. + + Returns: + Tuple[AtriaDataset, AtriaDataset]: The training and validation subsets. + """ + import copy + + from sklearn.model_selection import train_test_split + + assert train is not None, ( + "The dataset must have a 'train' split defined for sequential splitting." + ) + + train_dataset_size = len(train) + validation = copy.deepcopy(train) + train_subset, validation_subset = train_test_split( + list(range(train_dataset_size)), + test_size=1 - self.split_ratio, + random_state=42, + ) + train.set_subset_indices(list(train_subset)) + validation.set_subset_indices(list(validation_subset)) + return train, validation + + def __call__( + self, train_split: "MsgpackDatasetReader" + ) -> tuple["MsgpackDatasetReader", "MsgpackDatasetReader"]: + """ + Splits the dataset into training and validation subsets. + + The splitting strategy (sequential or random) is determined by the `shuffle` attribute. + + Args: + train_dataset (AtriaDataset): The dataset to split. + + Returns: + Tuple[AtriaDataset, AtriaDataset]: The training and validation subsets. + + Raises: + AssertionError: If the dataset is not an instance of `AtriaDataset` or if the + dataset size is unknown (e.g., in iterable mode). + """ + assert isinstance(train_split, MsgpackDatasetReader), ( + "The dataset must be a PyTorch or Hugging Face dataset." + ) + assert len(train_split) != "unknown", ( + "The dataset size is unknown. This means that the dataset is set up " + "in iterable mode and splitting is not supported." 
+ ) + if self.shuffle: + return self.create_random_split(train_split) + else: + return self.create_sequential_split(train_split) diff --git a/docgenie/data/_core/_synth.py b/docgenie/data/_core/_synth.py new file mode 100755 index 0000000000000000000000000000000000000000..73286a103956a48377d264819965313af0b2b65e --- /dev/null +++ b/docgenie/data/_core/_synth.py @@ -0,0 +1,589 @@ +import json + +import cv2 +import fitz +import numpy as np +import textdistance as td +import tqdm +from PIL import Image as PILImageLoader +from torch.utils.data import Dataset + +from docgenie.generation.constants import IMAGE_RENDER_EXT +from docgenie.generation.models import ( + SynDatasetDefinition, + SyntheticDatasetFileStructure, +) +from docgenie.generation.models._consts import DatasetTask +from docgenie.generation.models._log import SynDocumentLog +from docgenie.generation.utils.bboxes import read_syn_dataset_bboxes +from docgenie.logging import get_logger + +from ._data_types import ( + AnnotatedObject, + AnnotatedObjectList, + BoundingBox, + BoundingBoxList, + ClassificationAnnotation, + DocumentContent, + DocumentInstance, + EntityLabelingAnnotation, + ExtractiveQAAnnotation, + ExtractiveQAPair, + Image, + Label, + LabelList, + LayoutAnalysisAnnotation, +) +from ._utilities import TaskType + +logger = get_logger(__name__) + + +def _compute_anls( + predictions: list[list[str]], gold_labels: list[list[str]], tau=0.5, rank=0 +): + res = [] + for i, (preds, golds) in enumerate(zip(predictions, gold_labels)): + max_s = 0 + for pred in preds: + for gold in golds: + dis = td.levenshtein.distance(pred.lower(), gold.lower()) + max_len = max(len(pred), len(gold)) + if max_len == 0: + s = 0 + else: + nl = dis / max_len + s = 1 - nl if nl < tau else 0 + max_s = max(s, max_s) + res.append(max_s) + return res, sum(res) / len(res) + + +def _compute_iou(box1, box2): + """Compute IoU between two bounding boxes in format [x1, y1, x2, y2]""" + x1 = max(box1[0], box2[0]) + y1 = max(box1[1], box2[1]) + x2 = min(box1[2], box2[2]) + y2 = min(box1[3], box2[3]) + + if x2 <= x1 or y2 <= y1: + return 0.0 + + intersection = (x2 - x1) * (y2 - y1) + area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) + union = area1 + area2 - intersection + + return intersection / union if union > 0 else 0.0 + + +def _foreground_bbox_clip( + image, + bboxes, + coords_are_inclusive=True, + min_area=10, + morph_kernel_size=3, + debug=False, + unnormalize=True, +) -> list: + if image is None: + raise ValueError("Image is None") + + gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + H, W = gray.shape + + refined = [] + debug_vis = image.copy() + + for i, box in enumerate(bboxes): + x1, y1, x2, y2 = box + + # Handle normalized input + if unnormalize: + x1, y1, x2, y2 = x1 * W, y1 * H, x2 * W, y2 * H + + # Convert to ints + x1, y1, x2, y2 = map(lambda v: int(round(v)), (x1, y1, x2, y2)) + + if coords_are_inclusive: + x2_slice, y2_slice = x2 + 1, y2 + 1 + else: + x2_slice, y2_slice = x2, y2 + + # Clip to image boundaries + x1c, y1c = max(0, min(W - 1, x1)), max(0, min(H - 1, y1)) + x2c, y2c = max(0, min(W, x2_slice)), max(0, min(H, y2_slice)) + + if x2c <= x1c or y2c <= y1c: + refined.append([x1c, y1c, x2c, y2c]) + continue + + crop = gray[y1c:y2c, x1c:x2c] + blur = cv2.GaussianBlur(crop, (5, 5), 0) + + mean_val = float(np.mean(blur)) + invert = mean_val > 127 + + # Apply Otsu threshold + if invert: + _, mask = cv2.threshold( + blur, 0, 255, cv2.THRESH_BINARY_INV + 
cv2.THRESH_OTSU + ) + else: + _, mask = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) + + # ---- REMOVE HORIZONTAL LINES ---- + # Tune these values depending on your document scale + # horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (mask.shape[1] // 8, 1)) + # detect_horizontal = cv2.morphologyEx(mask, cv2.MORPH_OPEN, horizontal_kernel, iterations=1) + + # Subtract detected lines from the mask + # mask = cv2.subtract(mask, detect_horizontal) + + # (Optional) Also remove very thin components (height < 3 px) + num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( + mask, connectivity=8 + ) + clean_mask = np.zeros_like(mask) + for i in range(1, num_labels): + x, y, w, h, area = stats[i] + if h > 3: # ignore 1–2 pixel tall components (likely lines) + clean_mask[labels == i] = 255 + mask = clean_mask + + # plt.figure(figsize=(12, 12)) + # plt.imshow(mask, cmap='gray') + # plt.axis('off') + # plt.show() + + # Morphological closing + if morph_kernel_size and morph_kernel_size > 1: + kernel = cv2.getStructuringElement( + cv2.MORPH_RECT, (morph_kernel_size, morph_kernel_size) + ) + mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=1) + + # plt.figure(figsize=(12, 12)) + # plt.imshow(mask, cmap='gray') + # plt.axis('off') + # plt.show() + + # Remove small noise components + n_labels, labels, stats, _ = cv2.connectedComponentsWithStats( + mask, connectivity=8 + ) + keep_mask = np.zeros_like(mask, dtype=np.uint8) + + for label in range(1, n_labels): + if stats[label, cv2.CC_STAT_AREA] >= min_area: + keep_mask[labels == label] = 255 + + # If no foreground remains, keep original box + if np.count_nonzero(keep_mask) == 0: + refined.append([x1c, y1c, x2c, y2c]) + continue + + # Find tight bounds + ys, xs = np.where(keep_mask > 0) + y_min_local, y_max_local = int(ys.min()), int(ys.max()) + x_min_local, x_max_local = int(xs.min()), int(xs.max()) + + new_x1, new_y1 = x1c + x_min_local, y1c + y_min_local + new_x2, new_y2 = x1c + x_max_local, y1c + y_max_local + + new_x1, new_y1 = max(0, new_x1), max(0, new_y1) + new_x2, new_y2 = min(W - 1, new_x2), min(H - 1, new_y2) + + refined.append([new_x1, new_y1, new_x2, new_y2]) + + # --- Debug Visualization --- + if debug: + # Overlay mask in red channel + overlay = debug_vis.copy() + colored_mask = cv2.cvtColor(keep_mask, cv2.COLOR_GRAY2BGR) + colored_mask = cv2.resize(colored_mask, (x2c - x1c, y2c - y1c)) + overlay[y1c:y2c, x1c:x2c, 2] = np.maximum( + overlay[y1c:y2c, x1c:x2c, 2], colored_mask[:, :, 2] + ) + + debug_vis = cv2.addWeighted(debug_vis, 0.7, overlay, 0.3, 0) + + # Draw original bbox (yellow) and new bbox (green) + cv2.rectangle(debug_vis, (x1, y1), (x2, y2), (0, 255, 255), 1) + cv2.rectangle(debug_vis, (new_x1, new_y1), (new_x2, new_y2), (0, 255, 0), 2) + + # Label with index + cv2.putText( + debug_vis, + f"{i}", + (x1, max(10, y1 - 5)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 0, 255), + 1, + cv2.LINE_AA, + ) + + if debug: + return refined, debug_vis + return refined + + +class SynthesizedDataset(Dataset): + def __init__( + self, + dsdef: SynDatasetDefinition, + task_type: TaskType, + dataset_labels: list[str], + resize_images: bool = False, + clip_bboxes_to_foreground: bool = False, + ): + self.dataset_labels = dataset_labels + self.data = self._load_your_synthesized_data(dsdef) + self.task_type = task_type + self.resize_images = resize_images + self.clip_bboxes_to_foreground = clip_bboxes_to_foreground + + # remap dataset labels if cord + if dsdef.name.startswith("cord"): + 
self.dataset_labels = [x.replace(".", "_") for x in self.dataset_labels] + if dsdef.name.startswith("publaynet"): + self.dataset_labels = ["LE-" + x.upper() for x in self.dataset_labels] + if dsdef.name.startswith("doclaynet") and task_type == TaskType.layout_analysis: + self.dataset_labels = ["LE-" + x.upper() for x in self.dataset_labels] + if dsdef.name.startswith("icdar2019"): + self.dataset_labels = ["LE-" + x.upper() for x in self.dataset_labels] + if dsdef.name.startswith("tobacco3482"): + self.dataset_labels = [x.upper() for x in self.dataset_labels] + self.dataset_labels[self.dataset_labels.index("NEWS")] = "NEWS_ARTICLE" + self.dataset_labels[self.dataset_labels.index("ADVE")] = "ADVERTISEMENT" + + def _load_qa_gt(self, annotations: dict) -> dict: + qa_annotations = [] + for i, a in enumerate(annotations): + # if no answer is found we remove the sample + if len(a["answer_bbox_indices"]) == 0: + logger.warning( + f"No answer found for question id {i} in synthesized data. Skipping annotation." + ) + continue + + qa_annotation = { + "question_id": i, + "question": a["question"], + "answer_text": [a["answer"]], + "answer_start_indices": [a["answer_bbox_indices"][0]], + "answer_end_indices": [a["answer_bbox_indices"][-1]], + } + qa_annotations.append(qa_annotation) + + return {"qa_annotations": qa_annotations} + + def _load_kie_as_qa_gt( + self, annotations: dict, dsdef: SynDatasetDefinition + ) -> dict: + assert dsdef.prompt_task == "json", ( + "Modelling KIE tasks as QA in dataloader not implemented for annotation-type KIE." + ) + qa_annotations = [] + for i, a in enumerate(annotations["entities"]): + # if no answer is found we remove the sample + if len(a["bbox_indices"]) == 0: + logger.warning( + f"No answer found for KIE (modelled as QA) question id {i} in synthesized data. Skipping sample." + ) + continue + + qa_annotation = { + "question_id": i, + "question": a["key"], + "answer_text": [a["value"]], + "answer_start_indices": [a["bbox_indices"][0]], + "answer_end_indices": [a["bbox_indices"][-1]], + } + qa_annotations.append(qa_annotation) + + return {"qa_annotations": qa_annotations} + + def _load_classification_gt(self, annotations: dict) -> dict: + assert len(annotations) == 1 + return annotations # is already in correct format: {"label": "FORM"} + + def _load_kie_as_qa_gt( + self, annotations: dict, dsdef: SynDatasetDefinition + ) -> dict: + assert dsdef.prompt_task == "json", ( + "Modelling KIE tasks as QA in dataloader not implemented for annotation-type KIE." + ) + qa_annotations = [] + for i, a in enumerate(annotations["entities"]): + # if no answer is found we remove the sample + if len(a["bbox_indices"]) == 0: + logger.warning( + f"No answer found for KIE (modelled as QA) question id {i} in synthesized data. Skipping sample." 
+ ) + continue + + qa_annotation = { + "question_id": i, + "question": a["key"], + "answer_text": [a["value"]], + "answer_start_indices": [a["bbox_indices"][0]], + "answer_end_indices": [a["bbox_indices"][-1]], + } + qa_annotations.append(qa_annotation) + + return {"qa_annotations": qa_annotations} + + def _load_kie_gt(self, annotations: dict) -> dict: + return {"word_labels": annotations["word_labels"]} + + def _load_dla_gt(self, annotations: dict) -> dict: + dla_annotations = [] + for i, a in enumerate(annotations): + dla_annotation = { + "label": a["label"], + "bbox": [a["x0"], a["y0"], a["x2"], a["y2"]], # already normalized + } + dla_annotations.append(dla_annotation) + + return {"annotations": dla_annotations} + + def _load_your_synthesized_data(self, dsdef: SynDatasetDefinition) -> list[dict]: + dsfiles: SyntheticDatasetFileStructure = dsdef.get_file_structure() + dslog_path = dsfiles.base_path / "dataset_log.json" + dslog: dict = json.loads(dslog_path.read_text(encoding="utf-8")) + valid_samples = dslog["valid_samples"]["items"] + + samples = list() + for docid in tqdm.tqdm( + valid_samples, desc="Loading synthesized dataset samples" + ): + doclog = SynDocumentLog( + document_id=docid, logdir=dsfiles.document_logs_directory + ) + + annotations_path = dsfiles.gt_directory / f"{docid}.json" + annotations = json.loads(annotations_path.read_text(encoding="utf-8")) + + sample_annotations = None + match dsdef.task: + case DatasetTask.QA.value: + sample_annotations = self._load_qa_gt(annotations=annotations) + case DatasetTask.CLASSIFICATION.value: + sample_annotations = self._load_classification_gt( + annotations=annotations + ) + case DatasetTask.KIE.value: + if dsdef.dataloader_model_task_as == DatasetTask.QA.value: + sample_annotations = self._load_kie_as_qa_gt( + annotations=annotations, dsdef=dsdef + ) + else: + sample_annotations = self._load_kie_gt(annotations=annotations) + case DatasetTask.DLA.value: + sample_annotations = self._load_dla_gt(annotations=annotations) + case _: + raise ValueError(f"Unknown synthetic dataset task: {dsdef.task}") + + # TODO: implement other tasks than QA + + word_bbox_path = dsfiles.get_final_normalized_bbox_path( + level="word", doc_id=docid + ) + word_bboxes_raw = read_syn_dataset_bboxes(word_bbox_path) + seg_bbox_path = dsfiles.get_final_normalized_bbox_path( + level="segment", doc_id=docid + ) + seg_bboxes_raw = read_syn_dataset_bboxes(seg_bbox_path) + + words = [b.text for b in word_bboxes_raw] + word_bboxes = [[b.x0, b.y0, b.x2, b.y2] for b in word_bboxes_raw] + segment_level_bboxes = [[b.x0, b.y0, b.x2, b.y2] for b in seg_bboxes_raw] + + if len(word_bboxes) == 0: + logger.warning( + f"No word bboxes found for document id {docid} in synthesized data. Skipping sample." + ) + continue + + if doclog.ocr_required: + image_file_path = dsfiles.img_directory / f"{docid}.{IMAGE_RENDER_EXT}" + else: + image_file_path = dsfiles.final_pdf_directory / f"{docid}.pdf" + + sample = { + "sample_id": docid, + "image_file_path": image_file_path, + "words": words, + "word_bboxes": word_bboxes, + "segment_level_bboxes": segment_level_bboxes, + } + sample.update(sample_annotations) + samples.append(sample) + return samples + + def _prepare_annotations(self, sample, image) -> list: + if self.task_type == TaskType.sequence_classification: + assert self.dataset_labels is not None, "Dataset labels must be provided." 
+ return [ + ClassificationAnnotation( # assuming label is present as category map label to whichever classification category is output for synthesized data + label=Label( + name=sample["label"], + value=self.dataset_labels.index(sample["label"]), + ) + ) + ] + elif self.task_type == TaskType.token_classification: + # for token classification we use bio tagging. so we need to make sure label indices + # map back to riginal + assert self.dataset_labels is not None, "Dataset labels must be provided." + return [ + EntityLabelingAnnotation( + word_labels=LabelList.from_list( + [ + Label(value=self.dataset_labels.index(label), name=label) + for label in sample[ + "word_labels" + ] # here we assume word_labels are provided in synthesized data + ] + ) + ), + ] + + elif self.task_type == TaskType.extractive_qa: + qa_pairs = [] + for i, qa_annotation in enumerate(sample["qa_annotations"]): + qa_pair = ExtractiveQAPair( + id=qa_annotation["question_id"], # unique id if available + question_text=qa_annotation["question"], # question text + answer_start=qa_annotation[ + "answer_start_indices" + ], # start index answer in word tokens + answer_end=qa_annotation[ + "answer_end_indices" + ], # end index of answer in word tokens + answer_text=qa_annotation["answer_text"], # actual answer text + ) + qa_pairs.append(qa_pair) + return [ExtractiveQAAnnotation(qa_pairs=qa_pairs)] + + elif self.task_type == TaskType.layout_analysis: + assert self.dataset_labels is not None, "Dataset labels must be provided." + annotated_objects = [] + for annotation in sample["annotations"]: + label = annotation["label"] + assert label in self.dataset_labels, ( + f"Label {label} not in dataset labels. Found labels: {self.dataset_labels}" + ) + bbox = BoundingBox(value=annotation["bbox"], normalized=True) + + annotated_object = AnnotatedObject( + label=Label(value=self.dataset_labels.index(label), name=label), + bbox=bbox, + ) + annotated_objects.append(annotated_object) + + # convert to AnnotatedObjectList + annotated_objects = AnnotatedObjectList.from_list(annotated_objects) + + if self.clip_bboxes_to_foreground: + image = np.array(image) + refined_bboxes = _foreground_bbox_clip( + image, + annotated_objects.bbox.value, + coords_are_inclusive=False, + min_area=10, + morph_kernel_size=3, + unnormalize=annotated_objects.bbox.normalized, + ) + annotated_objects = annotated_objects.model_copy( + update={ + "bbox": BoundingBoxList(value=refined_bboxes).normalize( + image.shape[1], image.shape[0] + ) + } + ) + + return [ + LayoutAnalysisAnnotation(annotated_objects=annotated_objects), + ] + else: + raise ValueError(f"Unsupported task type: {self.task_type}") + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + sample = self.data[idx] + + image_file_path = str(sample["image_file_path"]) + if image_file_path.endswith(IMAGE_RENDER_EXT): + image = PILImageLoader.open(image_file_path) + elif image_file_path.endswith(".pdf"): + doc = fitz.open(image_file_path) + page = doc[0] + mat = fitz.Matrix(1, 1) + pix = page.get_pixmap(matrix=mat) + image = PILImageLoader.frombytes( + "RGB", [pix.width, pix.height], pix.samples + ) + else: + raise ValueError(f"Unsupported image file format: {image_file_path}") + + image = Image(file_path=sample["image_file_path"], content=image) + word_bboxes = sample["word_bboxes"] + segment_level_bboxes = sample["segment_level_bboxes"] + + # remap segment level bboxes to word level if counts mismatch + if len(word_bboxes) != len(segment_level_bboxes): + remapped_segment_level_bboxes = [] 
+ for word_bbox in word_bboxes: + best_iou = 0.0 + best_segment_bbox = word_bbox # fallback to word bbox if no good match + + for segment_bbox in segment_level_bboxes: + iou = _compute_iou(word_bbox, segment_bbox) + if iou > best_iou: + best_iou = iou + best_segment_bbox = segment_bbox + + remapped_segment_level_bboxes.append(best_segment_bbox) + segment_level_bboxes = remapped_segment_level_bboxes + + assert len(segment_level_bboxes) == len(word_bboxes) == len(sample["words"]), ( + f"Length mismatch after remapping for sample {sample['sample_id']}. " + f"Words: {len(sample['words'])}, Word BBoxes: {len(word_bboxes)}, " + f"Segment Level BBoxes: {len(segment_level_bboxes)}" + ) + + if self.resize_images: + image = image.resize_with_aspect_ratio(1024) + + return DocumentInstance( + sample_id=sample["sample_id"], + image=image, + content=DocumentContent( + words=sample["words"], # simple list of words + word_bboxes=BoundingBoxList(value=word_bboxes, normalized=True), + word_segment_level_bboxes=BoundingBoxList( + value=segment_level_bboxes, normalized=True + ), + ), + annotations=self._prepare_annotations(sample, image.content), + ) + + +""" +hey man I checked your file and it was just a small mistake on read. +1. I also fixed some other mistakes on write +2. added metadata file copying for labels +3. added normalization to word bboxes +4. I noticed you use xywh format is that correct? if so it'd be better to just change it to x1y1x2y2 right here + +Hey man i just wrote this. +i havent tested it for anything but it will give you the idea of what you need to do. +You will also have to add the synthesized dataset name for each in DATASET_CONFIG_MAP i guess for it to finally be loaded after being saved. + After that it could be loaded like any other dataset and preprocessed as well. + We need to do preprocessing in later step only because different training will result in different preprocssing +""" diff --git a/docgenie/data/_core/_utilities.py b/docgenie/data/_core/_utilities.py new file mode 100755 index 0000000000000000000000000000000000000000..7f55894f3649516aabc446126c2b1e4848769261 --- /dev/null +++ b/docgenie/data/_core/_utilities.py @@ -0,0 +1,137 @@ +import enum +from typing import Any + +from docgenie.data._core._data_types import BaseModelInput +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +class TaskType(str, enum.Enum): + generate_embeddings = "generate_embeddings" + sequence_classification = "sequence_classification" + token_classification = "token_classification" + extractive_qa = "extractive_qa" + layout_analysis = "layout_analysis" + table_extraction = "table_extraction" + table_detection = "table_detection" + + +def auto_dataloader(dataset: Any, **kwargs: Any) -> Any: + """ + Automatically configures a DataLoader for distributed training. + + This function adjusts DataLoader settings based on the distributed training configuration, + including rank, world size, and device type. It supports XLA devices and provides warnings + for incompatible configurations. + + Args: + iterator (Iterator): The dataset split iterator to load data from. + **kwargs (Any): Additional arguments for configuring the DataLoader. + + Returns: + DataLoader: A configured DataLoader instance. + + Raises: + ValueError: If incompatible configurations are detected. 
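+
+    Example (illustrative sketch; assumes the distributed context has been set up via
+    ``ignite.distributed`` and that ``train_dataset`` exists):
+
+        loader = auto_dataloader(
+            train_dataset, batch_size=64, num_workers=4, collate_fn=default_collate
+        )
+        for batch in loader:
+            ...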
+ """ + from ignite.distributed import DistributedProxySampler + from ignite.distributed import utils as idist + from ignite.distributed.comp_models import xla as idist_xla + from torch.utils.data import DataLoader, IterableDataset + from torch.utils.data.distributed import DistributedSampler + from torch.utils.data.sampler import Sampler + + rank = idist.get_rank() + world_size = idist.get_world_size() + + if world_size > 1: + if "batch_size" in kwargs and kwargs["batch_size"] >= world_size: + kwargs["batch_size"] //= world_size + + nproc = idist.get_nproc_per_node() + if "num_workers" in kwargs and kwargs["num_workers"] >= nproc: + kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc + + if "batch_sampler" not in kwargs: + if isinstance(dataset, IterableDataset): + logger.info( + "Found iterable dataset, dataloader will be created without any distributed sampling. " + "Please, make sure that the dataset itself produces different data on different ranks." + ) + else: + sampler: DistributedProxySampler | DistributedSampler | Sampler | None + sampler = kwargs.get("sampler", None) + if isinstance(sampler, DistributedSampler): + if sampler.rank != rank: + logger.warning( + f"Found distributed sampler with rank={sampler.rank}, but process rank is {rank}" + ) + if sampler.num_replicas != world_size: + logger.warning( + f"Found distributed sampler with num_replicas={sampler.num_replicas}, " + f"but world size is {world_size}" + ) + elif sampler is None: + shuffle = kwargs.pop("shuffle", True) + sampler = DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle + ) + else: + sampler = DistributedProxySampler( + sampler, num_replicas=world_size, rank=rank + ) + kwargs["sampler"] = sampler + else: + logger.warning( + "Found batch_sampler in provided kwargs. Please, make sure that it is compatible " + "with distributed configuration" + ) + + if ( + idist.has_xla_support + and idist.backend() == idist_xla.XLA_TPU + and kwargs.get("pin_memory", False) + ): + logger.warning( + "Found incompatible options: xla support and pin_memory args equal True. " + "Argument `pin_memory=False` will be used to construct data loader." 
+ ) + kwargs["pin_memory"] = False + else: + kwargs["pin_memory"] = kwargs.get("pin_memory", "cuda" in idist.device().type) + + dataloader = DataLoader(dataset, **kwargs) + if ( + idist.has_xla_support + and idist.backend() == idist_xla.XLA_TPU + and world_size > 1 + ): + logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA") + + from torch_xla.distributed.parallel_loader import MpDeviceLoader # type: ignore + + mp_device_loader_cls = MpDeviceLoader + mp_dataloader = mp_device_loader_cls(dataloader, idist.device()) + mp_dataloader.sampler = dataloader.sampler # type: ignore[attr-defined] + return mp_dataloader + + return dataloader + + +def default_collate(list_of_inputs: list[BaseModelInput] | list[list[BaseModelInput]]): + if isinstance(list_of_inputs[0], list): + list_of_inputs = [item for sublist in list_of_inputs for item in sublist] + if isinstance(list_of_inputs, list) and len(list_of_inputs) > 0: + return list_of_inputs[0].batch(list_of_inputs) + else: + raise ValueError("Batch is empty or not a list.") + + +def mmdet_pseudo_collate(batch: list["MMDetInput"]): + from atria_datasets.core.transforms.mmdet import MMDetInput + from mmengine.dataset.utils import pseudo_collate + + return MMDetInput.model_construct( + **pseudo_collate([sample.model_dump() for sample in batch]) + ) diff --git a/docgenie/data/_core/_visualization_utilities.py b/docgenie/data/_core/_visualization_utilities.py new file mode 100755 index 0000000000000000000000000000000000000000..25571260b9e7ec5559906f4786ab9dcd8d527fc3 --- /dev/null +++ b/docgenie/data/_core/_visualization_utilities.py @@ -0,0 +1,706 @@ +from __future__ import annotations + +import os +import random +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import textdistance as td +from PIL import Image, ImageDraw, ImageFont +import textwrap +from docgenie.data._core._data_types import ( + AnnotatedObjectList, + BoundingBoxList, + DatasetLabels, + DocumentInstance, + ExtractiveQAPair, +) +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +GT_BG_FILL=(0, 0, 0, 120) + +def merge_bio_bboxes( + words: List[str], bboxes: List[List[int]], labels: List[str] +) -> Tuple[List[str], List[List[int]], List[str]]: + """ + Merge BIO-style labeled bounding boxes into combined entity boxes and labels. + + Args: + words: List of words in sequence. + bboxes: List of bounding boxes [x1, y1, x2, y2] corresponding to each word. + labels: BIO labels, e.g., ["B-ANSWER", "I-ANSWER", "O", "B-QUESTION", "I-QUESTION"]. + + Returns: + merged_words: List of concatenated entity strings. + merged_bboxes: List of merged bounding boxes for each entity. + merged_labels: List of entity types (e.g., ["ANSWER", "QUESTION"]). 
+ """ + merged_words = [] + merged_bboxes = [] + merged_labels = [] + + current_words = [] + current_boxes = [] + current_label_type = None + + for word, bbox, label in zip(words, bboxes, labels): + if label.startswith("B-"): + # Finalize previous entity if any + if current_words: + x1 = min(b[0] for b in current_boxes) + y1 = min(b[1] for b in current_boxes) + x2 = max(b[2] for b in current_boxes) + y2 = max(b[3] for b in current_boxes) + merged_words.append(" ".join(current_words)) + merged_bboxes.append([x1, y1, x2, y2]) + merged_labels.append(current_label_type) + + # Start new entity + current_label_type = label.split("-", 1)[1] + current_words = [word] + current_boxes = [bbox] + + elif label.startswith("I-") and current_label_type == label.split("-", 1)[1]: + # Continue same entity + current_words.append(word) + current_boxes.append(bbox) + + else: + # Finalize previous if we hit O or mismatch + if current_words: + x1 = min(b[0] for b in current_boxes) + y1 = min(b[1] for b in current_boxes) + x2 = max(b[2] for b in current_boxes) + y2 = max(b[3] for b in current_boxes) + merged_words.append(" ".join(current_words)) + merged_bboxes.append([x1, y1, x2, y2]) + merged_labels.append(current_label_type) + current_words, current_boxes, current_label_type = [], [], None + + # If "O", skip (non-entity) + continue + + # Finalize last entity + if current_words: + x1 = min(b[0] for b in current_boxes) + y1 = min(b[1] for b in current_boxes) + x2 = max(b[2] for b in current_boxes) + y2 = max(b[3] for b in current_boxes) + merged_words.append(" ".join(current_words)) + merged_bboxes.append([x1, y1, x2, y2]) + merged_labels.append(current_label_type) + + return merged_words, merged_bboxes, merged_labels + + +def _save_visualization( + sample: DocumentInstance, + dataset_name: str, + output_dir: str, + split: str, + dataset_labels: DatasetLabels, + visualize_gt_only: bool = True, +): + """Save visualizations of document instance with bounding boxes and annotations.""" + + # Create output directory + sample_id = sample.sample_id.split("/")[-1] + output_path = Path(output_dir) / dataset_name / split + os.makedirs(output_path, exist_ok=True) + + # Extract annotations + annotations = _extract_annotations(sample=sample) + + # Extract content + words, word_bboxes, word_segment_level_bboxes = _extract_content_data(sample=sample) + + # Create filename suffix + label_suffix = "" + if "label" in annotations and annotations["label"] is not None: + label_suffix = ( + f"_label={annotations['label'].name}" if annotations["label"] else "" + ) + + # # Save visualizations + image = sample.image.content + if not visualize_gt_only: + if words is not None and word_bboxes is not None: + _save_word_bbox_visualization( + image=image, + word_bboxes=word_bboxes, + words=words, + word_labels=annotations["word_labels"], + output_path=output_path, + sample_id=sample_id, + label_suffix=label_suffix, + ) + + if words is not None and word_segment_level_bboxes is not None: + _save_segment_bbox_visualization( + image=image, + segment_bboxes=word_segment_level_bboxes, + words=words, + word_labels=annotations["word_labels"], + output_path=output_path, + sample_id=sample_id, + label_suffix=label_suffix, + ) + else: + if words is not None and word_bboxes is not None and annotations["word_labels"]: + _save_word_labels_visualization( + image=image, + word_bboxes=word_bboxes, + words=words, + word_labels=annotations["word_labels"], + output_path=output_path, + sample_id=sample_id, + label_suffix=label_suffix, + ) + + if 
annotations["qa_pairs"]: + _save_qa_visualization( + image=image, + word_bboxes=word_bboxes, + words=words, + qa_pairs=annotations["qa_pairs"], + output_path=output_path, + sample_id=sample_id, + label_suffix=label_suffix, + ) + + if annotations["annotated_objects"]: + _save_layout_visualization( + image=image, + annotated_objects=annotations["annotated_objects"], + image_size=sample.image.size, + output_path=output_path, + sample_id=sample_id, + layout_labels=dataset_labels.layout, + label_suffix=label_suffix, + ) + + +def _extract_annotations(sample: DocumentInstance) -> Dict[str, Any]: + """Extract annotations from sample.""" + annotations = { + "label": None, + "word_labels": None, + "qa_pairs": None, + "annotated_objects": None, + } + + for annotation in sample.annotations: + if annotation._type == "classification": + annotations["label"] = annotation.label + elif annotation._type == "entity_labeling": + annotations["word_labels"] = annotation.word_labels + elif annotation._type == "extractive_qa": + annotations["qa_pairs"] = annotation.qa_pairs + elif annotation._type == "layout": + annotations["annotated_objects"] = annotation.annotated_objects + + return annotations + + +def _extract_content_data( + sample: DocumentInstance, +) -> tuple[list[str], BoundingBoxList, Optional[BoundingBoxList]]: + """Extract content data from sample.""" + if sample.content is None: + return None, None, None + + words, word_bboxes, word_segment_level_bboxes = ( + sample.content.words, + sample.content.word_bboxes, + sample.content.word_segment_level_bboxes, + ) + + # Unnormalize bounding boxes + word_bboxes: BoundingBoxList = ( + _unnormalize_bboxes(word_bboxes, sample.image.size) + if word_bboxes.normalized + else word_bboxes + ) + + if word_segment_level_bboxes: + word_segment_level_bboxes = ( + _unnormalize_bboxes(word_segment_level_bboxes, sample.image.size) + if word_segment_level_bboxes.normalized + else word_segment_level_bboxes + ) + return ( + words, + word_bboxes, + word_segment_level_bboxes, + ) + + +def _unnormalize_bboxes(bbox_data, img_size): + """Unnormalize bounding boxes from 0-1 to pixel coordinates.""" + if not bbox_data or not bbox_data.value: + return None + + img_width, img_height = img_size + unnormalized_bboxes = [] + + for bbox in bbox_data.value: + unnormalized_bboxes.append( + [ + int(bbox[0] * img_width), + int(bbox[1] * img_height), + int(bbox[2] * img_width), + int(bbox[3] * img_height), + ] + ) + + return bbox_data.model_copy( + update={"value": unnormalized_bboxes, "normalized": False} + ) + + +def _draw_bboxes_on_image( + image: Image, + bboxes_data, + word_labels=None, +) -> Image: + """Draw bounding boxes with warm colors and readable transparent labels.""" + if not bboxes_data or not getattr(bboxes_data, "value", None): + return image.copy() + + img_copy = image.copy().convert("RGB") + draw = ImageDraw.Draw(img_copy, "RGBA") + + # Warm color palette (soft oranges, reds, and golds) + warm_colors = [ + (255, 99, 71), # tomato + (255, 140, 0), # dark orange + (255, 165, 0), # orange + (255, 69, 0), # red-orange + (255, 215, 0), # gold + (255, 182, 80), # light orange + ] + + # Calculate font size based on image dimensions as a ratio + img_width, img_height = image.size + base_size = img_height + font_size = max(12, int(base_size * 0.015)) # 2% of smaller dimension, minimum 12px + + try: + font = ImageFont.truetype("DejaVuSans.ttf", font_size) + except IOError: + font = ImageFont.load_default() + + unique_labels = [] + if word_labels: + unique_labels = list( + 
set(word_labels if isinstance(word_labels, list) else word_labels.name) + ) + + label_to_color = {} + for idx, label in enumerate(unique_labels): + label_to_color[label] = warm_colors[idx % len(warm_colors)] + + for i, bbox in enumerate(bboxes_data.value): + if len(bbox) < 4: + continue + + # Assign color based on label, fallback to random if no label + if word_labels and i < len(word_labels): + current_label = ( + word_labels[i] if isinstance(word_labels, list) else word_labels.name[i] + ) + color = label_to_color.get(current_label, random.choice(warm_colors)) + else: + color = random.choice(warm_colors) + + # Draw bounding box + try: + draw.rectangle(bbox[:4], outline=color + (255,), width=2) + except Exception as e: + print(f"Error drawing bounding box {bbox}: {e}") + continue + + # Prepare label text + text = "" + # if words and i < len(words): + # text = words[i] + if word_labels and i < len(word_labels): + text += f"{word_labels[i]}" + + if not text: + continue + + # Compute text size (modern Pillow uses textbbox) + try: + text_bbox = draw.textbbox((0, 0), text, font=font) + text_w, text_h = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1] + except AttributeError: + # Fallback for older Pillow versions + text_w, text_h = font.getsize(text) + + # Place text slightly above bbox + text_x = bbox[0] + text_y = max(0, bbox[1] - text_h - 4) + + # Draw transparent black background behind text + draw.rectangle( + [text_x, text_y, text_x + text_w + 6, text_y + text_h + 4], + fill=GT_BG_FILL, + ) + + # Draw white text on top + draw.text((text_x + 3, text_y + 2), text, fill=(255, 255, 255, 255), font=font) + + return img_copy + + +def _draw_qa_answers_on_image(image, word_bboxes, qa_pairs): + if not word_bboxes or not word_bboxes.value: + return image.copy() + + img_copy = image.copy().convert("RGB") + draw = ImageDraw.Draw(img_copy, "RGBA") + + warm_colors = [ + (255, 99, 71), + (255, 140, 0), + (255, 165, 0), + (255, 69, 0), + (255, 215, 0), + (255, 182, 80), + ] + + img_width, img_height = image.size + font_size = max(12, int(img_height * 0.018)) + try: + font = ImageFont.truetype("DejaVuSans.ttf", font_size) + except IOError: + font = ImageFont.load_default() + + max_text_width = int(img_width * 0.6) + + for qa_idx, qa_pair in enumerate(qa_pairs): + color = warm_colors[qa_idx % len(warm_colors)] + + question_text = getattr(qa_pair, "question_text", f"Q{qa_idx + 1}") + answer_starts = getattr(qa_pair, "answer_start", []) + answer_ends = getattr(qa_pair, "answer_end", []) + + for start, end in zip(answer_starts, answer_ends): + if start == -1 or end == -1 or start >= len(word_bboxes.value): + continue + + # Merge bounding boxes for full answer span + boxes = word_bboxes.value[start : min(end + 1, len(word_bboxes.value))] + if not boxes: + continue + + x1 = min(b[0] for b in boxes) + y1 = min(b[1] for b in boxes) + x2 = max(b[2] for b in boxes) + y2 = max(b[3] for b in boxes) + + draw.rectangle([x1, y1, x2, y2], outline=color + (255,), width=2) + + # Create wrapped text using textwrap + label_text = f"Q{qa_idx + 1}: {question_text}" + # Approximate chars per line based on width and font metrics + char_width = font.getlength("A") or font_size * 0.6 + max_chars = max_text_width // int(char_width) + wrapped = textwrap.fill(label_text, width=max_chars) + + # Compute text block size + text_bbox = draw.multiline_textbbox((0, 0), wrapped, font=font, spacing=4) + tw = text_bbox[2] - text_bbox[0] + th = text_bbox[3] - text_bbox[1] + + # Define a box above the answer span (or clamp to top of 
image)
+            text_x = x1
+            text_y = max(0, y1 - th - 8)
+
+            # Background rectangle
+            draw.rectangle(
+                [text_x, text_y, text_x + tw + 8, text_y + th + 6],
+                fill=GT_BG_FILL,
+            )
+
+            # Draw wrapped text directly
+            draw.multiline_text(
+                (text_x + 4, text_y + 3),
+                wrapped,
+                font=font,
+                fill=(255, 255, 255, 255),
+                spacing=4,
+            )
+
+    return img_copy
+
+
+def _save_word_labels_visualization(
+    image: Image,
+    word_bboxes,
+    words: List[str],
+    word_labels: Optional[List[str]],
+    output_path: Path,
+    sample_id: str,
+    label_suffix: str,
+):
+    """Save word-level label visualization, merging BIO-tagged spans into entity boxes."""
+    has_bio_tagging = False
+    if word_labels and any(
+        label.startswith("B-") or label.startswith("I-") for label in word_labels.name
+    ):
+        has_bio_tagging = True
+    if has_bio_tagging:
+        words, word_bboxes, word_labels = merge_bio_bboxes(
+            words, word_bboxes.value, word_labels.name
+        )
+        word_bboxes = BoundingBoxList(value=word_bboxes, normalized=False)
+
+    image_with_bboxes = _draw_bboxes_on_image(
+        image,
+        word_bboxes,
+        word_labels if isinstance(word_labels, list) else word_labels.name,
+    )
+    bbox_path = output_path / f"{sample_id}{label_suffix}_word_bboxes.png"
+    image_with_bboxes.save(bbox_path)
+    logger.info(f"Saved word bbox visualization: {bbox_path}")
+
+
+def _save_word_bbox_visualization(
+    image: Image,
+    word_bboxes,
+    words: List[str],
+    word_labels: Optional[List[str]],
+    output_path: Path,
+    sample_id: str,
+    label_suffix: str,
+):
+    """Save word-level bounding box visualization."""
+    # _draw_bboxes_on_image accepts (image, bboxes, word_labels); the previously passed
+    # extra words/colour arguments did not match its signature and are dropped here.
+    image_with_bboxes = _draw_bboxes_on_image(
+        image,
+        word_bboxes,
+        word_labels
+        if word_labels is None or isinstance(word_labels, list)
+        else word_labels.name,
+    )
+    bbox_path = output_path / f"{sample_id}{label_suffix}_word_bboxes.png"
+    image_with_bboxes.save(bbox_path)
+    logger.info(f"Saved word bbox visualization: {bbox_path}")
+
+
+def _save_segment_bbox_visualization(
+    image: Image,
+    segment_bboxes,
+    words: List[str],
+    word_labels: Optional[List[str]],
+    output_path: Path,
+    sample_id: str,
+    label_suffix: str,
+):
+    """Save segment-level bounding box visualization."""
+    try:
+        # Same signature fix as in _save_word_bbox_visualization above.
+        image_with_bboxes = _draw_bboxes_on_image(
+            image,
+            segment_bboxes,
+            word_labels
+            if word_labels is None or isinstance(word_labels, list)
+            else word_labels.name,
+        )
+    except Exception:
+        logger.error(
+            f"Error drawing segment bounding boxes for sample {sample_id}. Skipping visualization."
+ ) + return + bbox_path = output_path / f"{sample_id}{label_suffix}_segment_bboxes.png" + image_with_bboxes.save(bbox_path) + logger.info(f"Saved segment bbox visualization: {bbox_path}") + + +def _save_qa_visualization( + image: Image, + word_bboxes, + words: List[str], + qa_pairs: List[ExtractiveQAPair], + output_path: Path, + sample_id: str, + label_suffix: str, +): + """Save QA answer visualization and text file.""" + # Save QA image + image_with_qa = _draw_qa_answers_on_image(image, word_bboxes, qa_pairs) + qa_image_path = output_path / f"{sample_id}{label_suffix}_qa_answers.png" + image_with_qa.save(qa_image_path) + logger.info(f"Saved QA answers visualization: {qa_image_path}") + + # qa_txt_path = output_path / f"{sample_id}{label_suffix}_qa.txt" + # with open(qa_txt_path, "w", encoding="utf-8") as f: + # f.write(f"Document Index: {sample_id}\n\n") + # f.write("Document OCR:\n") + # f.write(",".join(words) + "\n\n") + + # for i, qa_pair in enumerate(qa_pairs): + # f.write(f"Q{i + 1}: {qa_pair.question_text}\n") + # f.write(f"A{i + 1}: {qa_pair.answer_text}\n") + + # answer_starts, answer_ends = qa_pair.answer_start, qa_pair.answer_end + # for idx, (start, end) in enumerate(zip(answer_starts, answer_ends)): + # f.write(f"Answer Span [{idx}]: ({start}, {end})\n") + # f.write(f"Extracted Answer: {' '.join(words[start : end + 1])}\n") + # f.write("\n") + + # logger.info(f"Saved QA info: {qa_txt_path}") + + +def _draw_layout_bboxes_on_image( + image: Image, + annotated_objects, + image_size: tuple[int, int], + layout_labels: List[str], +) -> Image: + """ + Draw layout bounding boxes with warm colors (one color per label) and filled area. + """ + + # Unnormalize if needed + bboxes = ( + _unnormalize_bboxes(annotated_objects.bbox, image_size) + if annotated_objects.bbox.normalized + else annotated_objects.bbox + ) + + img_copy = image.copy().convert("RGB") + draw = ImageDraw.Draw(img_copy, "RGBA") + + # Warm color palette + warm_colors = [ + (255, 99, 71), # tomato + (255, 140, 0), # dark orange + (255, 165, 0), # orange + (255, 69, 0), # red-orange + (255, 215, 0), # gold + (255, 182, 80), # light orange + ] + + # Calculate font size based on image dimensions + img_width, img_height = image.size + base_size = img_height + font_size = max(12, int(base_size * 0.015)) + try: + font = ImageFont.truetype("DejaVuSans.ttf", font_size) + except IOError: + font = ImageFont.load_default() + + # Map each layout label to a warm color + unique_labels = list(set(layout_labels)) + label_to_color = { + label: warm_colors[idx % len(warm_colors)] + for idx, label in enumerate(unique_labels) + } + + for bbox, label_idx in zip(bboxes.value, annotated_objects.label.value): + if len(bbox) < 4: + continue + + x1, y1, x2, y2 = bbox + label_text = layout_labels[label_idx] + color = label_to_color.get(label_text, random.choice(warm_colors)) + + # Draw bounding box (outline only) + draw.rectangle([x1, y1, x2, y2], outline=color + (255,), width=3) + + # Draw label text with transparent black background + text_x, text_y = x1, max(0, y1 - font_size - 4) + try: + text_bbox = draw.textbbox((text_x, text_y), label_text, font=font) + text_w, text_h = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1] + except AttributeError: + text_w, text_h = font.getsize(label_text) + + # Background rectangle + draw.rectangle( + [text_x, text_y, text_x + text_w + 6, text_y + text_h + 4], + fill=GT_BG_FILL, + ) + + # Draw text + draw.text( + (text_x + 3, text_y + 2), label_text, fill=(255, 255, 255, 255), font=font + ) + + return 
img_copy + + +def _save_layout_visualization( + image: Image, + annotated_objects: AnnotatedObjectList, + image_size: tuple[int, int], + output_path: Path, + sample_id: str, + layout_labels: list[str], + label_suffix: str, +): + """Save layout annotation visualization.""" + # Placeholder function for layout visualization + layout_image_path = output_path / f"{sample_id}{label_suffix}_layout.png" + img_copy = _draw_layout_bboxes_on_image( + image, + annotated_objects, + image_size=image_size, + layout_labels=layout_labels, + ) + img_copy.save(str(layout_image_path) + ".png") + logger.info(f"Saved layout visualization (placeholder): {layout_image_path}.png") + + +def _anls_metric_str( + predictions: list[list[str]], gold_labels: list[list[str]], tau=0.5, rank=0 +): + res = [] + for i, (preds, golds) in enumerate(zip(predictions, gold_labels)): + max_s = 0 + for pred in preds: + for gold in golds: + dis = td.levenshtein.distance(pred.lower(), gold.lower()) + max_len = max(len(pred), len(gold)) + if max_len == 0: + s = 0 + else: + nl = dis / max_len + s = 1 - nl if nl < tau else 0 + max_s = max(s, max_s) + res.append(max_s) + return res, sum(res) / len(res) + + +def _compute_qa_stats(split_reader, split_name): + """Compute QA statistics for a given dataset split.""" + import tqdm + + total_questions = 0 + total_answers_found = 0 + all_extracted_answers = [] + all_gold_answers = [] + + for sample in tqdm.tqdm(split_reader, f"Computing QA stats for {split_name}..."): + # Extract annotations + words = sample.content.words if sample.content else [] + annotations = _extract_annotations(sample=sample) + for qa_pair in annotations["qa_pairs"]: + total_questions += 1 + extracted_answers = [] + for ans_start, ans_end in zip(qa_pair.answer_start, qa_pair.answer_end): + if ans_start != -1 and ans_end != -1: + extracted_answers.append( + " ".join(words[ans_start : ans_end + 1]) + if ans_start != -1 and ans_end != -1 + else "" + ) + if len(extracted_answers) > 0: + total_answers_found += 1 + all_extracted_answers.append(extracted_answers) + all_gold_answers.append(qa_pair.answer_text) + + logger.info(f"{split_name} - total_questions: {total_questions}") + logger.info(f"{split_name} - total_answers_found: {total_answers_found}") + + if total_questions > 0: + logger.info("Computing ANLS metric...") + logger.info("First 10 extracted answers:\n%s", all_extracted_answers[:50]) + logger.info("First 10 gold answers:\n%s", all_gold_answers[:50]) + _, anls = _anls_metric_str(all_extracted_answers, all_gold_answers) + logger.info(f"{split_name} - anls: {anls}") diff --git a/docgenie/data/_transforms/__init__.py b/docgenie/data/_transforms/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..2ccc382d21fc34473bf968ba936eba3b5a08718d --- /dev/null +++ b/docgenie/data/_transforms/__init__.py @@ -0,0 +1,11 @@ +from ._tokenizers._document_processors import ( + QuestionAnsweringDocumentProcessor, + SequenceClassificationDocumentProcessor, + TokenClassificationDocumentProcessor, +) + +__all__ = [ + "SequenceClassificationDocumentProcessor", + "TokenClassificationDocumentProcessor", + "QuestionAnsweringDocumentProcessor", +] diff --git a/docgenie/data/_transforms/_generics/_base.py b/docgenie/data/_transforms/_generics/_base.py new file mode 100755 index 0000000000000000000000000000000000000000..d5ec29fa2a1977cc44dacd0664c52b869256b0a7 --- /dev/null +++ b/docgenie/data/_transforms/_generics/_base.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import Generic, TypeVar + 
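+# This module holds the minimal transform interface used by the concrete transforms in
+# this package: ToRGB normalises images to three channels, and BaseTransform is the
+# generic pydantic base class that the document/image processors subclass.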
+import torch +from atria_core.utilities.repr import RepresentationMixin +from PIL.Image import Image as PILImage +from pydantic import BaseModel + +from docgenie.logging import get_logger + +logger = get_logger(__name__) + +T = TypeVar("T") + + +class ToRGB(object): + def __call__(self, image: PILImage | torch.Tensor) -> PILImage | torch.Tensor: + if isinstance(image, torch.Tensor): + if image.shape[0] == 3: + return image + return image.repeat(3, 1, 1) + else: + return image.convert("RGB") + + +class BaseTransform(RepresentationMixin, BaseModel, Generic[T]): + def get_output_data_model(self) -> type[T]: + raise NotImplementedError + + def __call__(self, *args, **kwargs) -> T | list[T]: + raise NotImplementedError diff --git a/docgenie/data/_transforms/_generics/_hf_processor.py b/docgenie/data/_transforms/_generics/_hf_processor.py new file mode 100755 index 0000000000000000000000000000000000000000..3dae2d7a7f78af97b362aa364933b520332160db --- /dev/null +++ b/docgenie/data/_transforms/_generics/_hf_processor.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import inspect +from typing import Any + +from pydantic import Field +from transformers import ( + AutoProcessor, + BatchEncoding, + BertTokenizerFast, + RobertaTokenizerFast, +) + +from docgenie.data._transforms._generics._base import BaseTransform + +# add custom models +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +class HuggingfaceProcessor(BaseTransform[BatchEncoding]): + _TOKENIZERS_REQUIRING_SPLIT_TEXT = (BertTokenizerFast, RobertaTokenizerFast) + + tokenizer_name: str = "microsoft/layoutlmv3-base" + init_kwargs: dict = Field(default_factory=dict) + call_kwargs: dict = Field(default_factory=dict) + cache_dir: str = "./cache" + overflow_sampling: str = "return_all" + + @property + def tokenizer(self): + return ( + self._hf_processor.tokenizer + if hasattr(self._hf_processor, "tokenizer") + else self._hf_processor + ) + + @property + def all_special_ids(self) -> set[int]: + return set(self.tokenizer.all_special_ids) + + def model_post_init(self, context) -> None: + assert self.overflow_sampling in [ + "return_all", + "return_random_n", + "no_overflow", + "return_first_n", + ], f"Overflow sampling strategy {self.overflow_sampling} is not supported." 
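+        # "no_overflow" disables return_overflowing_tokens in the default call kwargs
+        # below; the remaining strategies ("return_all", "return_first_n",
+        # "return_random_n") presumably control how the overflow windows produced by
+        # the tokenizer are sub-sampled further downstream.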
+ + self._hf_processor = self._initialize_transform() + + def _get_default_call_kwargs(self): + return { + "add_special_tokens": True, + "padding": "max_length", + "truncation": True, + "max_length": 512, + "stride": 0, + "pad_to_multiple_of": 8, + "is_split_into_words": True, + "return_overflowing_tokens": self.overflow_sampling + != "no_overflow", # set some arguments that we need to stay fixed for our case + "return_token_type_ids": None, + "return_attention_mask": True, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "return_tensors": "pt", + "verbose": True, + } + + def _initialize_transform(self): + processor = AutoProcessor.from_pretrained( + self.tokenizer_name, + cache_dir=self.cache_dir, + local_files_only=False, + apply_ocr=False, + add_prefix_space=True, + do_lower_case=True, + do_normalize=False, + do_resize=False, + do_rescale=False, + **self.init_kwargs, + ) + + self.call_kwargs = {**self._get_default_call_kwargs(), **self.call_kwargs} + self._possible_args = inspect.signature(processor.__call__).parameters + for key in list(self.call_kwargs.keys()): + if key not in self._possible_args: + logger.warning( + f"Invalid keyword argument '{key}' found in call_kwargs for {self.__class__.__name__}. Skipping it." + ) + self.call_kwargs.pop(key) + return processor + + def get_config(self): + return { + "tokenizer_name": self.tokenizer_name, + "init_kwargs": self.init_kwargs, + "call_kwargs": self.call_kwargs, + } + + def get_output_data_model(self) -> type[BatchEncoding]: + return BatchEncoding + + def _convert_text_to_list(self, text: Any) -> list[str]: + if isinstance(text, str): + return text.split() + elif isinstance(text, list): + return text + else: + raise ValueError("Input text must be a string or a list of strings.") + + def __call__(self, **inputs) -> BatchEncoding: + if isinstance(self.tokenizer, self._TOKENIZERS_REQUIRING_SPLIT_TEXT): + text = inputs.get("text", None) + text_pair = inputs.get("text_pair", None) + + if text is not None and text_pair is not None: + inputs["text"] = self._convert_text_to_list(text) + inputs["text_pair"] = self._convert_text_to_list(text_pair) + + assert isinstance(inputs["text"], list), ( + "Input 'text' must be a list of strings." + ) + assert isinstance(inputs["text_pair"], list), ( + "Input 'text_pair' must be a list of strings." 
+ ) + filtered_inputs = {k: v for k, v in inputs.items() if k in self._possible_args} + return self._hf_processor(**filtered_inputs, **self.call_kwargs) diff --git a/docgenie/data/_transforms/_generics/_image_processor.py b/docgenie/data/_transforms/_generics/_image_processor.py new file mode 100755 index 0000000000000000000000000000000000000000..ae6a24ef8ae438f32801e7d97eefd508b0bde2f2 --- /dev/null +++ b/docgenie/data/_transforms/_generics/_image_processor.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from PIL.Image import Image as PILImage + +from docgenie.data._transforms._generics._base import BaseTransform, ToRGB +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +class ImageProcessor(BaseTransform[PILImage]): + do_normalize: bool = True # Normalize the image to ImageNet mean and std + do_resize: bool = True # Resize the image to 224x224 + use_imagenet_mean_std: bool = False + resize_height: int = 224 + resize_width: int = 224 + image_mean: list[float] | None = None + image_std: list[float] | None = None + + def model_post_init(self, context) -> None: + from transformers.utils.constants import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ) + + self.image_mean = self.image_mean or IMAGENET_STANDARD_MEAN + self.image_std = self.image_std or IMAGENET_STANDARD_STD + if self.use_imagenet_mean_std: + self.image_mean = IMAGENET_DEFAULT_MEAN + self.image_std = IMAGENET_DEFAULT_STD + + # prepare image transform + self._transform = self._prepare_image_transform() + + def _prepare_image_transform(self): + from torchvision.transforms import Compose, Normalize, Resize, ToTensor + + transform = [ToRGB(), ToTensor()] + if self.do_resize: + transform += [ + Resize( + (self.resize_height, self.resize_width), + interpolation=2, # type: ignore[attr-defined] + antialias=True, # type: ignore[attr-defined] + ), + ] + if self.do_normalize: + transform += [ + Normalize(mean=self.image_mean, std=self.image_std), + ] + transform = Compose(transform) + return transform + + def get_output_data_model(self) -> type[PILImage]: + return PILImage + + def __call__(self, image: PILImage) -> PILImage: + return self._transform(image) diff --git a/docgenie/data/_transforms/_tokenizers/_conditional_generation.py b/docgenie/data/_transforms/_tokenizers/_conditional_generation.py new file mode 100755 index 0000000000000000000000000000000000000000..124cba1e8e9c9ecaadcf22d5b475c0ac8d224b06 --- /dev/null +++ b/docgenie/data/_transforms/_tokenizers/_conditional_generation.py @@ -0,0 +1,547 @@ +from __future__ import annotations + +import json + +from docgenie.data._transforms._tokenizers._document_processors import BaseTransform +from docgenie.data._transforms._tokenizers._udop_processor import CustomUdopProcessor +from docgenie.logging import get_logger + +from ..._core._data_types import ( + AnnotatedObjectList, + ConditionalGenerationModelInput, + DatasetLabels, + DocumentInstance, + ExtractiveQAPair, + Label, + LabelList, +) +from ..._core._utilities import TaskType +from ._utilities import _extract_annotations + +logger = get_logger(__name__) + + +class ConditionalGenerationTokenizer(BaseTransform): + task_type: TaskType + tokenizer_name: str = "microsoft/udop-large" + tokenizer_cache_dir: str = "./cache" + is_training: bool = True + generate_entity_vocabulary: bool = True + dataset_labels: DatasetLabels + + def get_output_data_model(self) -> type: + return ConditionalGenerationModelInput + + def model_post_init(self, 
context) -> None: + from transformers import AutoProcessor + + self._default_init_kwargs = { + "cache_dir": self.tokenizer_cache_dir, + "local_files_only": False, + "apply_ocr": False, + } + self._default_call_kwargs = { + "add_special_tokens": True, + "padding": "max_length", + "truncation": True, + "max_length": 1024, + "stride": 0, + "pad_to_multiple_of": 8, + "return_tensors": "pt", + } + if self.task_type == TaskType.token_classification: + self._default_call_kwargs["return_overflowing_tokens"] = True + self._default_call_kwargs["return_offsets_mapping"] = True + self._default_call_kwargs["stride"] = 128 + self._default_call_kwargs["max_length"] = 512 + self._processor = CustomUdopProcessor.from_pretrained( + self.tokenizer_name, + **self._default_init_kwargs, + clean_up_tokenization_spaces=False, + ) + else: + self._processor = AutoProcessor.from_pretrained( + self.tokenizer_name, **self._default_init_kwargs + ) + self._tokenizer = ( + self._processor.tokenizer + if hasattr(self._processor, "tokenizer") + else self._processor + ) + + # if self.task_type == TaskType.token_classification and self.generate_entity_vocabulary: + # possible_labels = ( + # self.dataset_labels.ser + # if self.dataset_labels.ser is not None + # else [] + # ) + # possible_labels = [f"<{lbl}>" for lbl in possible_labels] + # num_added_tokens = self._tokenizer.add_special_tokens({"additional_special_tokens": possible_labels}) + # logger.info(f"Added {num_added_tokens} special tokens for entity labels: {possible_labels}") + + def _get_common_kwargs(self, document_instance: DocumentInstance) -> tuple: + # get pil image from the document instance + image = document_instance.image.load().content.convert("RGB") + + # get words from the document instance + words = ( + document_instance.content.words + if document_instance.content is not None + else [] + ) + + # get bounding boxes from the document instance + boxes = ( + document_instance.content.word_bboxes.value + if document_instance.content is not None + else [] + ) + + return image, words, boxes + + def _prepare_instances_for_sequence_classification( + self, document_instance: DocumentInstance, label: Label + ) -> ConditionalGenerationModelInput: + import torch + + possible_labels = ( + self.dataset_labels.classification + if self.dataset_labels.classification is not None + else [] + ) + image, words, boxes = self._get_common_kwargs(document_instance) + prompt = f"Document Classification. Classify the document into one of these categories: {', '.join(possible_labels)}. 
Document: " + target_text = label.name + + if not words: + # Supply a dummy token and box so UDOP doesn't crash + words = ["None"] + boxes = [[0, 0, 0, 0]] + + tokenized_instance = {} + if self.tokenizer_name == "microsoft/udop-large": + tokenized_instance = self._processor( + image, prompt, text_pair=words, boxes=boxes, **self._default_call_kwargs + ) + elif self.tokenizer_name in ["google-t5/t5-large", "google-t5/t5-base"]: + tokenized_instance = self._processor( + prompt, text_pair=" ".join(words), **self._default_call_kwargs + ) + + for key, value in tokenized_instance.items(): + tokenized_instance[key] = value.squeeze(0) + + # # for debugging decode the input ids + # decoded_input = self._processor.decode(tokenized_instance['input_ids'], skip_special_tokens=True) + # print('Decoded input:', decoded_input) + + # Tokenize target text to get target_token_ids + target_token_ids = self._tokenizer.encode( # this takes text but returns a batch, truly a garbage design + target_text, + add_special_tokens=True, + return_tensors="pt", + max_length=16, + truncation=True, + padding="max_length", + )[0] + + # decoded_target_text = self._processor.decode(target_token_ids, skip_special_tokens=True) + # print('Decoded target_text:', decoded_target_text) + + # Set padding token IDs to -100 to ignore in loss computation + target_token_ids[target_token_ids == 0] = -100 + + return ConditionalGenerationModelInput( + **tokenized_instance, + index=torch.tensor(document_instance.index), + sample_id=document_instance.sample_id, + words=words, + target_text=target_text, + target_token_ids=target_token_ids, + _tokenizer_name=self.tokenizer_name, + _tokenizer_init_kwargs=self._default_init_kwargs, + ) + + def _generate_target_text_for_token_classification( + self, + words: list[str], + word_labels: list[str], + target_text_type: str = "key_value_pairs", + ) -> str: + # entities = {} + + target_text = "" + for word_idx, (word, word_label) in enumerate( + zip(words, word_labels, strict=True) + ): + target_text += f"{word} {word_label} " + target_text = target_text.strip() + return target_text + + # if word_label == "O" or word_label.startswith("I-"): + # continue + + # if word_label.startswith("B-"): + # entity_words = [word] + # for next_word, next_label in zip( + # words[word_idx + 1 :], word_labels[word_idx + 1 :] + # ): + # if next_label == f"I-{word_label[2:]}": + # entity_words.append(next_word) + # else: + # break + + # if word_label[2:] not in entities: + # entities[word_label[2:]] = [] + # entities[word_label[2:]].append(" ".join(entity_words)) + + if len(entities) == 0: + return None + + if target_text_type == "csv": + lines = [] + separator = "|" + for key, values in entities.items(): + for value in values: + line = f"{key}={value}{separator}" + lines.append(line) + lines[-1] = lines[-1].rstrip(f"{separator}") # remove sep from last line + return "".join(lines) + elif target_text_type == "json": + return json.dumps(entities) + else: + raise NotImplementedError( + f"Target text type {target_text_type} not supported." + ) + + def _prepare_instances_for_token_classification( + self, document_instance: DocumentInstance, word_labels: LabelList + ) -> list[ConditionalGenerationModelInput]: + import torch + + image, words, boxes = self._get_common_kwargs(document_instance) + prompt = "Information Extraction. 
Extract all the entities present in this document: Document: " + + if not words: + words = ["None"] + boxes = [[0, 0, 0, 0]] + word_labels.name = ["O"] + + features, encoded_batch = None, None + if self.tokenizer_name == "microsoft/udop-large": + features, encoded_batch = self._processor( + image, prompt, text_pair=words, boxes=boxes, **self._default_call_kwargs + ) + elif self.tokenizer_name in ["google-t5/t5-large", "google-t5/t5-base"]: + raise NotImplementedError( + "Token classification not implemented for T5 models yet." + ) + + sequence_ids = [] + word_ids = [] + for i in range(len(encoded_batch["input_ids"])): + sequence_ids_per_overflow = encoded_batch.sequence_ids(i) + word_ids_per_overflow = encoded_batch.word_ids(i) + + # filter sequence_ids + sequence_ids_per_overflow = [ + -100 if x is None else x for x in sequence_ids_per_overflow + ] + word_ids_per_overflow = [ + -100 if x is None else x for x in word_ids_per_overflow + ] + if max(sequence_ids_per_overflow) > 0: + word_ids_per_overflow = [ + -100 if sequence_id == 0 else word_id + for word_id, sequence_id in zip( + word_ids_per_overflow, sequence_ids_per_overflow + ) + ] + sequence_ids.append(sequence_ids_per_overflow) + word_ids.append(word_ids_per_overflow) + + sequence_ids = torch.tensor(sequence_ids) + word_ids = torch.tensor(word_ids) + + # to compare the targets we need to know where the start of next overlfow sequence is after stride + last_max_word_id = -1 + instances = [] + for overflow_idx in range(len(encoded_batch["input_ids"])): + # find min max word ids + input_ids_per_per_overflow = encoded_batch["input_ids"][overflow_idx] + word_ids_per_overflow = word_ids[overflow_idx] + min_word_id = min( + [wid for wid in word_ids_per_overflow.tolist() if wid != -100] + ) + max_word_id = max( + [wid for wid in word_ids_per_overflow.tolist() if wid != -100] + ) + + # words in this overflow + words_in_this_overflow = words[min_word_id : max_word_id + 1] + word_labels_in_this_overflow = word_labels.name[ + min_word_id : max_word_id + 1 + ] + + target_text = self._generate_target_text_for_token_classification( + words=words_in_this_overflow, + word_labels=word_labels_in_this_overflow, + target_text_type="csv", + ) + + if target_text is None: + continue + + target_token_ids = self._tokenizer.encode( + target_text, + add_special_tokens=True, + return_tensors="pt", + max_length=1024, + truncation=True, + padding="max_length", + )[0] + + # word labels after stride + word_to_extract_in_this_overflow = words[ + last_max_word_id + 1 : max_word_id + 1 + ] + word_labels_to_extract_in_this_overflow = word_labels.name[ + last_max_word_id + 1 : max_word_id + 1 + ] + # decoded_target_text = tokenizer.decode(target_token_ids, skip_special_tokens=True) + # decoded_input_text = tokenizer.decode(input_ids_per_overflow, skip_special_tokens=True) + last_max_word_id = max_word_id + + # index: Optional["torch.Tensor"] = None + # sample_id: Optional[str] = None + # input_ids: Optional["torch.Tensor"] = None + # bbox: Optional["torch.Tensor"] = None + # attention_mask: Optional["torch.Tensor"] = None + # pixel_values: Optional["torch.Tensor"] = None + # question_text: Optional[str] = None + # target_text: Optional[str] = None + # target_token_ids: Optional["torch.Tensor"] = None + # words: Optional[list[str]] = None + # word_labels: Optional[list[str]] = None + # label: Optional["torch.Tensor"] = None + # _tokenizer_name: Optional[str] = None + # _tokenizer_init_kwargs: Optional[dict] = None + + # Set padding token IDs to -100 to ignore in loss 
computation + target_token_ids[target_token_ids == 0] = -100 + + instance = ConditionalGenerationModelInput( + index=torch.tensor(document_instance.index), + sample_id=document_instance.sample_id, + input_ids=input_ids_per_per_overflow, + attention_mask=features["attention_mask"][overflow_idx], + pixel_values=features["pixel_values"][overflow_idx], + bbox=features["bbox"][overflow_idx], + words=word_to_extract_in_this_overflow, + word_labels=word_labels_to_extract_in_this_overflow, + target_text=target_text, + target_token_ids=target_token_ids, + _tokenizer_name=self.tokenizer_name, + _tokenizer_init_kwargs=self._default_init_kwargs, + ) + instances.append(instance) + + if self.is_training: + random_index = int(torch.randint(0, len(instances), (1,)).item()) + return instances[random_index] + + return instances + + def _prepare_instances_for_question_answering( + self, document_instance: DocumentInstance, qa_pairs: list[ExtractiveQAPair] + ) -> list[ConditionalGenerationModelInput]: + import torch + + image, words, boxes = self._get_common_kwargs(document_instance) + + instances = [] + for qa_pair in qa_pairs: + # since we can have multiple answers per question, we need to handle that here and just take one which is not + # -1 # we don't need to remove no answer indices in conditional generation setting as we always have the answer anyway + # word_ans_start, word_ans_end = -1, -1 + # for ans_start, ans_end in zip(qa_pair.answer_start, qa_pair.answer_end): + # if ans_start != -1 and ans_end != -1: + # word_ans_start = ans_start + # word_ans_end = ans_end + # break + + # if word_ans_start == -1 or word_ans_end == -1: + # if self.is_training: + # logger.warning(f"Skipping QA pair with no answer during training: {qa_pair}") + # continue + + prompt = f"Question answering. 
{qa_pair.question_text}" + target_text = qa_pair.answer_text[0] + + tokenized_instance = {} + if self.tokenizer_name == "microsoft/udop-large": + tokenized_instance = self._processor( + image, + prompt, + text_pair=words, + boxes=boxes, + **self._default_call_kwargs, + ) + elif self.tokenizer_name in ["google-t5/t5-large", "google-t5/t5-base"]: + tokenized_instance = self._processor( + prompt, text_pair=" ".join(words), **self._default_call_kwargs + ) + + for key, value in tokenized_instance.items(): + tokenized_instance[key] = value.squeeze(0) + + # # # for debugging decode the input ids + # decoded_input = self._processor.decode(tokenized_instance['input_ids'], skip_special_tokens=True) + # print('Decoded input:', decoded_input) + + # Tokenize target text to get target_token_ids + target_token_ids = self._tokenizer.encode( # this takes text but returns a batch, truly a garbage design + target_text, + add_special_tokens=True, + return_tensors="pt", + max_length=128, + truncation=True, + padding="max_length", + )[0] + + # decoded_target_text = self._processor.decode(target_token_ids, skip_special_tokens=True) + # print('Decoded target_text:', decoded_target_text) + + # Set padding token IDs to -100 to ignore in loss computation + target_token_ids[target_token_ids == 0] = -100 + + instance = ConditionalGenerationModelInput( + **tokenized_instance, + index=torch.tensor(document_instance.index), + sample_id=document_instance.sample_id, + words=words, + target_text=target_text, + question_text=qa_pair.question_text, + target_token_ids=target_token_ids, + _tokenizer_name=self.tokenizer_name, + _tokenizer_init_kwargs=self._default_init_kwargs, + ) + + instances.append(instance) + + if self.is_training: + random_index = int(torch.randint(0, len(instances), (1,)).item()) + return instances[random_index] + + return instances + + def _prepare_instances_for_layout_analysis( + self, + document_instance: DocumentInstance, + annotated_objects: AnnotatedObjectList, + ) -> str: + possible_labels = ( + self.dataset_labels.layout if self.dataset_labels.layout is not None else [] + ) + image, words, boxes = self._get_common_kwargs(document_instance) + prompt = f"Layout Analysis. Extract the layout entities present in the document into one of these categories: {', '.join(possible_labels)}. 
Document: "
+        # torch is used below but is not imported at module level; import it locally
+        # here, mirroring the other _prepare_* methods.
+        import torch
+
+        bbox_labels_concatenated = []
+        for label, bbox in zip(
+            annotated_objects.label.name,
+            annotated_objects.bbox,
+        ):
+            bbox = [int(x * 1000) for x in bbox]
+            bbox_str = "".join([f"<{coord}>" for coord in bbox])
+            bbox_labels_concatenated.append(f"{bbox_str}<{label}>")
+
+        target_text = ",".join(bbox_labels_concatenated)
+
+        tokenized_instance = {}
+        if self.tokenizer_name == "microsoft/udop-large":
+            tokenized_instance = self._processor(
+                image, prompt, **self._default_call_kwargs
+            )
+        elif self.tokenizer_name in ["google-t5/t5-large", "google-t5/t5-base"]:
+            tokenized_instance = self._processor(
+                prompt, text_pair=" ".join(words), **self._default_call_kwargs
+            )
+
+        for key, value in tokenized_instance.items():
+            tokenized_instance[key] = value.squeeze(0)
+
+        # for debugging decode the input ids
+        # decoded_input = self._processor.decode(tokenized_instance['input_ids'], skip_special_tokens=True)
+        # print('Decoded input:', decoded_input)
+
+        # Tokenize target text to get target_token_ids
+        tokenizer = (
+            self._processor.tokenizer
+            if hasattr(self._processor, "tokenizer")
+            else self._processor
+        )
+        target_token_ids = tokenizer.encode(  # encode() returns a batch even for a single string, hence the [0]
+            target_text,
+            add_special_tokens=True,
+            return_tensors="pt",
+            max_length=256,
+            truncation=True,
+            padding="max_length",
+        )[0]
+
+        # debugging
+        # print(tokenizer.special_tokens_map)
+        # print(tokenizer.additional_special_tokens)
+        # print(tokenizer.additional_special_tokens_ids)
+        # print('target_token_ids', target_token_ids)
+        # for idx, token in enumerate(target_token_ids):
+        #     decoded_token = tokenizer.decode([token.item()])
+        #     print(f'Token ID: {token.item()} -> Decoded Token: "{decoded_token}"')
+        #     if idx > 10:
+        #         break
+        # decoded_input = self._processor.decode(target_token_ids, skip_special_tokens=True)
+        # print('Decoded target_text:', target_text)
+
+        # Set padding token IDs to -100 to ignore in loss computation
+        target_token_ids[target_token_ids == 0] = -100
+
+        return ConditionalGenerationModelInput(
+            **tokenized_instance,
+            index=torch.tensor(document_instance.index),
+            sample_id=document_instance.sample_id,
+            words=words,
+            target_text=target_text,
+            target_token_ids=target_token_ids,
+            image_size=document_instance.image.load().content.size,
+            _tokenizer_name=self.tokenizer_name,
+            _tokenizer_init_kwargs=self._default_init_kwargs,
+        )
+
+    def __call__(
+        self, document_instance: DocumentInstance
+    ) -> ConditionalGenerationModelInput | list[ConditionalGenerationModelInput]:
+        # prepare prompt based on task type
+        annotations = _extract_annotations(document_instance)
+        if self.task_type == TaskType.sequence_classification:
+            return self._prepare_instances_for_sequence_classification(
+                document_instance, annotations.label
+            )
+        elif self.task_type == TaskType.token_classification:
+            return self._prepare_instances_for_token_classification(
+                document_instance, annotations.word_labels
+            )
+        elif self.task_type == TaskType.extractive_qa:
+            return self._prepare_instances_for_question_answering(
+                document_instance, annotations.qa_pairs
+            )
+        elif self.task_type == TaskType.layout_analysis:
+            return self._prepare_instances_for_layout_analysis(
+                document_instance, annotations.annotated_objects
+            )
+        else:
+            raise NotImplementedError(f"Task type {self.task_type} not supported.")
+
+    def __repr__(self) -> str:
+        return f"ConditionalGenerationTokenizer(task_type={self.task_type}, is_training={self.is_training})"
+
+    def __str__(self) -> str:
+        return 
f"ConditionalGenerationTokenizer(task_type={self.task_type}, is_training={self.is_training})" diff --git a/docgenie/data/_transforms/_tokenizers/_document_processors.py b/docgenie/data/_transforms/_tokenizers/_document_processors.py new file mode 100755 index 0000000000000000000000000000000000000000..116c0808d28386e3bcb963b7dd0523b6da5ee55b --- /dev/null +++ b/docgenie/data/_transforms/_tokenizers/_document_processors.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +from dataclasses import replace + +import numpy as np +import torch +from pydantic import ConfigDict, Field + +from docgenie.data._core._data_types import ( + DocumentInstance, + DocumentInstanceModelInput, +) +from docgenie.data._transforms._generics._base import BaseTransform +from docgenie.data._transforms._generics._hf_processor import HuggingfaceProcessor +from docgenie.data._transforms._generics._image_processor import ImageProcessor +from docgenie.logging import get_logger + +from ._utilities import ( + _document_instance_to_hf_processor_inputs, + _extract_annotations, + _generate_qa_token_ids, + _post_process_tokenizer_outputs, +) + +logger = get_logger(__name__) + + +class BaseDocumentProcessor(BaseTransform[DocumentInstanceModelInput]): + model_config = ConfigDict( + arbitrary_types_allowed=True, validate_assignment=True, extra="forbid" + ) + + # tokenizer args + tokenizer_name: str = "microsoft/layoutlmv3-base" + init_kwargs: dict = Field(default_factory=dict) + call_kwargs: dict = Field(default_factory=dict) + overflow_sampling: str = "return_all" + max_overflow_samples: int = 10 + use_segment_level_bboxes: bool = False + cache_dir: str = "./cache" + + # image processor args + do_normalize: bool = True # Normalize the image to ImageNet mean and std + do_resize: bool = True # Resize the image to 224x224 + use_imagenet_mean_std: bool = False + resize_height: int = 224 + resize_width: int = 224 + image_mean: list[float] | None = None + image_std: list[float] | None = None + + # segment-level-rank info args + add_segment_level_info: bool = False + max_segment_num: int = 150 + + def model_post_init(self, context) -> None: + self._hf_processor = HuggingfaceProcessor( + tokenizer_name=self.tokenizer_name, + init_kwargs=self.init_kwargs, + call_kwargs=self.call_kwargs, + overflow_sampling=self.overflow_sampling, + cache_dir=self.cache_dir, + ) + self._image_transform = ImageProcessor( + do_normalize=self.do_normalize, + do_resize=self.do_resize, + use_imagenet_mean_std=self.use_imagenet_mean_std, + resize_height=self.resize_height, + resize_width=self.resize_width, + image_mean=self.image_mean, + image_std=self.image_std, + ) + + def get_output_data_model(self): + return DocumentInstanceModelInput + + def __call__( + self, document_instance: DocumentInstance + ) -> DocumentInstanceModelInput | list[DocumentInstanceModelInput]: + hf_processor_inputs = _document_instance_to_hf_processor_inputs( + document_instance, + use_segment_level_bboxes=self.use_segment_level_bboxes, + image_transform=self._image_transform, + ) + tokenization_data = self._hf_processor(**hf_processor_inputs) + processed_outputs = _post_process_tokenizer_outputs( + tokenization_data=tokenization_data, + input_word_boxes=hf_processor_inputs.get("boxes", None), + input_word_labels=hf_processor_inputs.get("word_labels", None), + input_image=hf_processor_inputs.get("images", None), + add_segment_level_info=self.add_segment_level_info, + all_special_ids=self._hf_processor.tokenizer.all_special_ids, + max_segment_num=self.max_segment_num, + ) + return 
DocumentInstanceModelInput( + index=torch.tensor(document_instance.index) + if document_instance.index is not None + else None, + sample_id=document_instance.sample_id, + words=hf_processor_inputs.pop("text", None), + tokenizer_config=self._hf_processor.get_config(), + **processed_outputs, + ) + + +class SequenceClassificationDocumentProcessor(BaseDocumentProcessor): + def __call__( + self, document_instance: DocumentInstance + ) -> DocumentInstanceModelInput | list[DocumentInstanceModelInput]: + instance = super().__call__(document_instance) + annotations = _extract_annotations(document_instance) + assert annotations.label is not None, "No label found in the document instance." + if isinstance(instance, list): + return [ + replace( + inst, + label=torch.tensor(annotations.label.value), + ) + for inst in instance + ] + return replace( + instance, + label=torch.tensor(annotations.label.value), + ) + + +class TokenClassificationDocumentProcessor(BaseDocumentProcessor): + pass + + +class QuestionAnsweringDocumentProcessor(BaseDocumentProcessor): + ignore_samples_with_no_answer: bool = False + is_training: bool = False + + def model_post_init(self, context) -> None: + # update call kwargs + self.call_kwargs["truncation"] = "only_second" + + super().model_post_init(context) + + def _is_no_answer_sample( + self, token_answer_start, token_answer_end, tokenization_data + ): + total_answers = len(token_answer_start) + for key, value in tokenization_data.items(): + if value is None: + continue + if key == "image": + continue + assert len(value) == total_answers, ( + f"Length mismatch in tokenization data for key {key}. " + f"Expected length: {total_answers}, Actual length: {len(value)}" + ) + + valid_indices = [] + for idx, (s, e) in enumerate(zip(token_answer_start, token_answer_end)): + if s != -1 and e != -1: + valid_indices.append(idx) + + if len(valid_indices) == 0: + return True # skip this sample entirely + + if len(valid_indices) < total_answers: + tokenization_data = { + k: v[valid_indices] if v is not None and k not in ["image"] else v + for k, v in tokenization_data.items() + } + token_answer_start = token_answer_start[valid_indices] + token_answer_end = token_answer_end[valid_indices] + + assert (np.array(token_answer_end) != -1).all(), ( + f"Some end answer indices are -1 in document {token_answer_end}" + ) + assert (np.array(token_answer_start) != -1).all(), ( + f"Some start answer indices are -1 in document {token_answer_start}" + ) + total_answers = len(token_answer_start) + for key, value in tokenization_data.items(): + if value is None: + continue + if key == "image": + continue + assert len(value) == total_answers, ( + f"Length mismatch in tokenization data for key {key}. " + f"Expected length: {total_answers}, Actual length: {len(value)}" + ) + return False + + def __call__( + self, document_instance: DocumentInstance + ) -> DocumentInstanceModelInput | list[DocumentInstanceModelInput]: + qa_pairs = _extract_annotations(document_instance).qa_pairs + assert qa_pairs is not None, "No QA pairs found in the document instance." + assert len(qa_pairs) > 0, "No QA pairs found in the document instance." 
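+        # Each QA pair is tokenized into its own DocumentInstanceModelInput below; the
+        # resulting sample_id carries a "_subsample_<i>" suffix so every instance can be
+        # traced back to its source question.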
+ + transformed_instances = [] + for qa_pair_index in range(len(qa_pairs)): + # prepare model input + hf_processor_inputs = _document_instance_to_hf_processor_inputs( + document_instance, + use_segment_level_bboxes=self.use_segment_level_bboxes, + image_transform=self._image_transform, + context=qa_pairs[qa_pair_index].question_text, + ) + + text_pair = hf_processor_inputs.get("text_pair", None) + boxes = hf_processor_inputs.get("boxes", None) + assert len(text_pair) == len(boxes), ( + f"Length mismatch between text_pair and boxes for sample {document_instance.sample_id}. " + f"Length of text_pair: {len(text_pair)}, Length of boxes: {len(boxes)}" + ) + + tokenization_data = self._hf_processor(**hf_processor_inputs) + processed_outputs = _post_process_tokenizer_outputs( + tokenization_data=tokenization_data, + input_word_boxes=hf_processor_inputs.get("boxes", None), + input_word_labels=hf_processor_inputs.get("word_labels", None), + input_image=hf_processor_inputs.get("images", None), + add_segment_level_info=self.add_segment_level_info, + all_special_ids=self._hf_processor.tokenizer.all_special_ids, + max_segment_num=self.max_segment_num, + ) + + token_answer_start, token_answer_end = _generate_qa_token_ids( + qa_pair=qa_pairs[qa_pair_index], + word_ids=processed_outputs["word_ids"], + sequence_ids=processed_outputs["sequence_ids"], + sequence_length=processed_outputs["token_ids"].shape[-1], + ) + + # if all token_answer_start and token_answer_end are 0, it means we could not find the answer in the context + # therefore using this sample as a training sample will not help the model learn anything + if self.is_training and self.ignore_samples_with_no_answer: + total_answers = len(token_answer_start) + for key, value in processed_outputs.items(): + if value is None: + continue + if key == "image": + continue + assert len(value) == total_answers, ( + f"Length mismatch in tokenization data for key {key}. " + f"Expected length: {total_answers}, Actual length: {len(value)}" + ) + + valid_indices = [] + for idx, (s, e) in enumerate(zip(token_answer_start, token_answer_end)): + if s != -1 and e != -1: + valid_indices.append(idx) + + if len(valid_indices) == 0: + continue # skip this sample entirely + + if len(valid_indices) < total_answers: + processed_outputs = { + k: v[valid_indices] + if v is not None and k not in ["image"] + else v + for k, v in processed_outputs.items() + } + token_answer_start = token_answer_start[valid_indices] + token_answer_end = token_answer_end[valid_indices] + + assert (np.array(token_answer_end) != -1).all(), ( + f"Some end answer indices are -1 in document {token_answer_end}" + ) + assert (np.array(token_answer_start) != -1).all(), ( + f"Some start answer indices are -1 in document {token_answer_start}" + ) + total_answers = len(token_answer_start) + for key, value in processed_outputs.items(): + if value is None: + continue + if key == "image": + continue + assert len(value) == total_answers, ( + f"Length mismatch in tokenization data for key {key}. 
" + f"Expected length: {total_answers}, Actual length: {len(value)}" + ) + + # make sure afterwards we always have one length for all processed outputs + sample_id = document_instance.sample_id + "_subsample_" + str(qa_pair_index) + transformed_instance = DocumentInstanceModelInput( + index=torch.tensor(document_instance.index) + if document_instance.index is not None + else None, + sample_id=sample_id, + words=hf_processor_inputs.pop("text_pair", None), + question_id=qa_pair_index, + qa_question=qa_pairs[qa_pair_index].question_text, + qa_answers=qa_pairs[qa_pair_index].answer_text, + token_answer_start=token_answer_start, + token_answer_end=token_answer_end, + tokenizer_config=self._hf_processor.get_config(), + **processed_outputs, + ) + transformed_instances.append(transformed_instance) + return transformed_instances diff --git a/docgenie/data/_transforms/_tokenizers/_udop_processor.py b/docgenie/data/_transforms/_tokenizers/_udop_processor.py new file mode 100755 index 0000000000000000000000000000000000000000..937cc93aab9d601ab0b5b2f4c94405051607824a --- /dev/null +++ b/docgenie/data/_transforms/_tokenizers/_udop_processor.py @@ -0,0 +1,89 @@ +# verify input +# patch udop processor to return word and sequence ids +from transformers.models.udop.processing_udop import UdopProcessor, UdopProcessorKwargs +from typing import List, Optional, Union + +from transformers import logging + +from transformers.image_processing_utils import BatchFeature +from transformers.image_utils import ImageInput +from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + + + +class CustomUdopProcessor(UdopProcessor): + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + *args, + audio=None, + videos=None, + **kwargs: Unpack[UdopProcessorKwargs], + ) -> BatchFeature: + output_kwargs = self._merge_kwargs( + UdopProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + **self.prepare_and_validate_optional_call_args(*args), + ) + + boxes = output_kwargs["text_kwargs"].pop("boxes", None) + word_labels = output_kwargs["text_kwargs"].pop("word_labels", None) + text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) + return_overflowing_tokens = output_kwargs["text_kwargs"].get("return_overflowing_tokens", False) + return_offsets_mapping = output_kwargs["text_kwargs"].get("return_offsets_mapping", False) + text_target = output_kwargs["text_kwargs"].get("text_target", None) + + if self.image_processor.apply_ocr and (boxes is not None): + raise ValueError( + "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True." + ) + + if self.image_processor.apply_ocr and (word_labels is not None): + raise ValueError( + "You cannot provide word labels if you initialized the image processor with apply_ocr set to True." 
+ ) + + if return_overflowing_tokens and not return_offsets_mapping: + raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.") + + if text_target is not None: + # use the processor to prepare the targets of UDOP + return self.tokenizer( + **output_kwargs["text_kwargs"], + ) + + else: + # use the processor to prepare the inputs of UDOP + # first, apply the image processor + features = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + features_words = features.pop("words", None) + features_boxes = features.pop("boxes", None) + + output_kwargs["text_kwargs"].pop("text_target", None) + output_kwargs["text_kwargs"].pop("text_pair_target", None) + output_kwargs["text_kwargs"]["text_pair"] = text_pair + output_kwargs["text_kwargs"]["boxes"] = boxes if boxes is not None else features_boxes + output_kwargs["text_kwargs"]["word_labels"] = word_labels + + # second, apply the tokenizer + if text is not None and self.image_processor.apply_ocr and text_pair is None: + if isinstance(text, str): + text = [text] # add batch dimension (as the image processor always adds a batch dimension) + output_kwargs["text_kwargs"]["text_pair"] = features_words + + encoded_inputs = self.tokenizer( + text=text if text is not None else features_words, + **output_kwargs["text_kwargs"], + ) + + # add pixel values + if return_overflowing_tokens is True: + features["pixel_values"] = self.get_overflowing_images( + features["pixel_values"], encoded_inputs["overflow_to_sample_mapping"] + ) + features.update(encoded_inputs) + + return features, encoded_inputs \ No newline at end of file diff --git a/docgenie/data/_transforms/_tokenizers/_utilities.py b/docgenie/data/_transforms/_tokenizers/_utilities.py new file mode 100755 index 0000000000000000000000000000000000000000..ec2d3dc2e17c08b723586c79e4c7cf1b260f34f9 --- /dev/null +++ b/docgenie/data/_transforms/_tokenizers/_utilities.py @@ -0,0 +1,430 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Callable, Mapping + +import torch +from transformers import BatchEncoding + +from docgenie.logging import get_logger + +from ..._core._data_types import ( + AnnotatedObjectList, + ClassificationAnnotation, + DocumentInstance, + EntityLabelingAnnotation, + ExtractiveQAAnnotation, + ExtractiveQAPair, + Label, + LabelList, + LayoutAnalysisAnnotation, +) + +logger = get_logger(__name__) + + +@dataclass +class Annotations: + label: Label | None = None + word_labels: LabelList | None = None + qa_pairs: list[ExtractiveQAPair] | None = None + annotated_objects: AnnotatedObjectList | None = None + + +def _document_instance_to_hf_processor_inputs( + document_instance: DocumentInstance, + use_segment_level_bboxes: bool = False, + image_transform: Callable | None = None, + context: str | None = None, +) -> dict[str, Any]: + if document_instance.content is None: + return {} + + inputs = {} + + if context is None: + if document_instance.content.words is not None: + inputs["text"] = document_instance.content.words + else: + qa_pairs = _extract_annotations(document_instance).qa_pairs + assert qa_pairs is not None and len(qa_pairs) > 0, ( + "No QA pairs found in the document instance for extractive QA task." 
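+        # Illustrative note: in the extractive-QA path the question/context string is
+        # passed as `text` and the OCR words as `text_pair`, so downstream code can use
+        # sequence_ids() to tell question tokens (0) from document tokens (1).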
+ ) + inputs["text"] = context + inputs["text_pair"] = document_instance.content.words + + if document_instance.content.word_bboxes is not None: + inputs["boxes"] = ( + document_instance.content.word_segment_level_bboxes.value + if use_segment_level_bboxes + and document_instance.content.word_segment_level_bboxes is not None + else document_instance.content.word_bboxes.value + ) + + if document_instance.image is not None: + inputs["images"] = ( + image_transform(document_instance.image.content) + if image_transform is not None + else document_instance.image.content + ) + + # extract annotations + annotations = _extract_annotations(document_instance) + if annotations.label is not None: + inputs["label"] = annotations.label.value + if annotations.word_labels is not None: + inputs["word_labels"] = annotations.word_labels.value + + return inputs + + +def _extract_sequence_and_word_ids( + tokenization_data: BatchEncoding, +) -> tuple[torch.Tensor, torch.Tensor]: + sequence_ids = [] + word_ids = [] + input_ids = tokenization_data["input_ids"] + num_overflow_samples = len(input_ids) # type: ignore + for i in range(num_overflow_samples): + sequence_ids_per_overflow = tokenization_data.sequence_ids(i) + word_ids_per_overflow = tokenization_data.word_ids(i) + + # filter sequence_ids + sequence_ids_per_overflow = [ + -100 if x is None else x for x in sequence_ids_per_overflow + ] + word_ids_per_overflow = [ + -100 if x is None else x for x in word_ids_per_overflow + ] + if max(sequence_ids_per_overflow) > 0: + word_ids_per_overflow = [ + -100 if sequence_id == 0 else word_id + for word_id, sequence_id in zip( + word_ids_per_overflow, sequence_ids_per_overflow + ) + ] + sequence_ids.append(sequence_ids_per_overflow) + word_ids.append(word_ids_per_overflow) + + sequence_ids = torch.tensor(sequence_ids) + word_ids = torch.tensor(word_ids) + return sequence_ids, word_ids + + +def _extract_token_bboxes_from_word_bboxes( + word_bboxes: list[list[float]], word_ids: torch.Tensor +) -> torch.Tensor: + token_bboxes = [] + for word_ids_per_sample in word_ids: + token_bboxes_per_sample = [ + [0, 0, 0, 0] if word_id == -100 else word_bboxes[word_id] + for word_id in word_ids_per_sample.tolist() + ] + token_bboxes.append(token_bboxes_per_sample) + return torch.tensor(token_bboxes) + + +def _extract_token_labels_from_word_labels( + word_labels: list[int], word_ids: Any +) -> torch.Tensor: + token_labels = [] + for word_ids_per_sample in word_ids: + token_labels_per_sample = [] + last_word_id = None + for word_id in word_ids_per_sample.tolist(): + if word_id == -100 or word_id == last_word_id: + token_labels_per_sample.append(-100) # padding label + else: + token_labels_per_sample.append(word_labels[word_id]) + last_word_id = word_id + token_labels.append(token_labels_per_sample) + return torch.tensor(token_labels) + + +def _extract_segment_level_data( + token_ids: torch.Tensor, + token_bboxes: torch.Tensor, + all_special_ids: set[int], + max_segment_num: int = 150, +) -> Mapping[str, Any]: + segment_index = _generate_segment_level_bbox_ranks( + token_ids=token_ids, + segment_level_bboxes=token_bboxes, + all_special_ids=all_special_ids, + ) + segment_inner_token_rank = _generate_segment_level_inner_ranks( + line_rank_id=segment_index + ) + first_token_idxes, first_token_idxes_mask = _generate_first_token_idxes( + line_rank_id=segment_index, max_segment_num=max_segment_num + ) + return { + "segment_index": segment_index, + "segment_inner_token_rank": segment_inner_token_rank, + "first_token_idxes": first_token_idxes, 
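+        # (assumed shapes: segment_index / segment_inner_token_rank follow token_ids at
+        #  [num_chunks, seq_len]; first_token_idxes and its mask are zero-padded or
+        #  truncated to [num_chunks, max_segment_num])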
+ "first_token_idxes_mask": first_token_idxes_mask, + } + + +def _post_process_tokenizer_outputs( + tokenization_data: BatchEncoding, + input_word_boxes: list[list[float]] | None, + input_word_labels: list[int] | None, + input_image: Any | None, + add_segment_level_info: bool = False, + all_special_ids: set[int] = set(), + max_segment_num: int = 150, +) -> Mapping[str, Any]: + sequence_ids, word_ids = _extract_sequence_and_word_ids(tokenization_data) + token_bboxes = tokenization_data.get("bbox", None) + if token_bboxes is None and input_word_boxes is not None: + token_bboxes = _extract_token_bboxes_from_word_bboxes( + input_word_boxes, word_ids + ) + token_labels = tokenization_data.get("labels", None) + if token_labels is None and input_word_labels is not None: + token_labels = _extract_token_labels_from_word_labels( + input_word_labels, word_ids + ) + image = tokenization_data.get("pixel_values", None) + if image is not None: + image = image[0] + if image is None and input_image is not None: + image = input_image + + outputs = { + "token_ids": tokenization_data.get("input_ids"), + "attention_mask": tokenization_data.get("attention_mask"), + "token_bboxes": token_bboxes, + "token_type_ids": tokenization_data.get("token_type_ids", None), + "token_labels": token_labels, + "sequence_ids": sequence_ids, + "word_ids": word_ids, + "image": image, + } + + if add_segment_level_info: + segment_level_data = _extract_segment_level_data( + token_ids=outputs["token_ids"], + token_bboxes=outputs["token_bboxes"], + all_special_ids=all_special_ids, + max_segment_num=max_segment_num, + ) + outputs.update(segment_level_data) + + # assert that we have all the keys + assert outputs["token_ids"] is not None, ( + "token_ids is None in the tokenizer outputs." + ) + assert outputs["attention_mask"] is not None, ( + "attention_mask is None in the tokenizer outputs." + ) + assert outputs["token_bboxes"] is not None, ( + "token_bboxes is None in the tokenizer outputs." + ) + assert outputs["sequence_ids"] is not None, ( + "sequence_ids is None in the tokenizer outputs." + ) + assert outputs["word_ids"] is not None, "word_ids is None in the tokenizer outputs." + assert outputs["image"] is not None, "image is None in the tokenizer outputs." + if input_word_labels is not None: + assert outputs["token_labels"] is not None, ( + "token_labels is None in the tokenizer outputs." 
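+        # (token_labels is only required when word-level labels were supplied, e.g. for
+        #  entity labelling; QA and classification samples may legitimately leave it None)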
+ ) + + return outputs + + +def _get_subword_start_end(word_start, word_end, word_ids, sequence_ids): + start_of_context = -1 + for i in range(len(sequence_ids)): + if sequence_ids[i] == 1: + start_of_context = i + break + num_question_tokens = start_of_context + assert start_of_context != -1, "Could not find the start of the context" + subword_start = -1 + subword_end = -1 + for i in range(start_of_context, len(word_ids)): + if word_start == word_ids[i] and subword_start == -1: + subword_start = i + if word_end == word_ids[i]: + subword_end = i + return subword_start, subword_end, num_question_tokens + + +def _generate_qa_token_ids( + qa_pair: ExtractiveQAPair, + word_ids: torch.Tensor, + sequence_ids: torch.Tensor, + sequence_length: int = 512, +) -> tuple[torch.Tensor, torch.Tensor]: + import torch + + # since we can have multiple answers per question, we need to handle that here and just take one which is not + # -1 + word_ans_start, word_ans_end = -1, -1 + for ans_start, ans_end in zip(qa_pair.answer_start, qa_pair.answer_end): + if ans_start != -1 and ans_end != -1: + word_ans_start = ans_start + word_ans_end = ans_end + break + + # now we have one answer, with start and end indices in the word level + # we need to convert them to token level + + token_answer_starts, token_answer_ends = [], [] + for word_ids_per_overflow, sequence_ids_per_overflow in zip( + word_ids, sequence_ids, strict=True + ): + token_answer_start, token_answer_end = None, None + if word_ans_start == -1: + token_answer_start = -1 + token_answer_end = -1 + else: + (token_answer_start, token_answer_end, _) = _get_subword_start_end( + word_ans_start, + word_ans_end, + word_ids_per_overflow, + sequence_ids_per_overflow, + ) + if token_answer_start == -1: + token_answer_start = -1 + token_answer_end = -1 + if token_answer_end == -1: + token_answer_end = sequence_length - 1 + assert token_answer_end >= token_answer_start, ( + "End token index is less than start token index. " + "Something is wrong in the conversion from answer word indices to answer token indices." 
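+            # (if the answer's end word falls outside this overflow chunk, token_answer_end
+            #  was clamped to sequence_length - 1 above, so the invariant also holds for
+            #  truncated answers)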
+ ) + token_answer_starts.append(token_answer_start) + token_answer_ends.append(token_answer_end) + token_answer_start = torch.tensor( + token_answer_starts, dtype=torch.long, device=word_ids.device + ) + token_answer_end = torch.tensor( + token_answer_ends, dtype=torch.long, device=word_ids.device + ) + return token_answer_start, token_answer_end + + +def _extract_annotations(sample: DocumentInstance) -> Annotations: + """Extract annotations from sample.""" + annotations = Annotations() + + if sample.annotations is not None: + for ann in sample.annotations: + if isinstance(ann, ClassificationAnnotation): + annotations.label = ann.label + elif isinstance(ann, ExtractiveQAAnnotation): + annotations.qa_pairs = ann.qa_pairs + elif isinstance(ann, EntityLabelingAnnotation): + annotations.word_labels = ann.word_labels + elif isinstance(ann, LayoutAnalysisAnnotation): + annotations.annotated_objects = ann.annotated_objects + + return annotations + + +def _generate_segment_level_bbox_ranks( + token_ids: torch.Tensor, + segment_level_bboxes: torch.Tensor, + all_special_ids: set[int], +): + import torch + + line_rank_ids = [] + assert len(token_ids) == len(segment_level_bboxes), ( + f"Token ids and segment level bboxes must have the same batch size, Got {len(token_ids)} and {len(segment_level_bboxes)}" + ) + for token_ids_per_sample, bboxes_per_sample in zip( + token_ids, segment_level_bboxes + ): # this is a shape of [batch_size, seq_len, 4] in xyxy format and normalized 0-1000 + assert len(token_ids_per_sample) == len(bboxes_per_sample), ( + "Token ids and segment level bboxes must have the same sequence length" + ) + line_rank_ids_per_sample = [] + line_rank = 0 + last_b = None + for token_id, b in zip(token_ids_per_sample, bboxes_per_sample): + if last_b is not None and not torch.equal(b, last_b): + line_rank += 1 + if token_id in all_special_ids: + line_rank_ids_per_sample.append(0) + else: + line_rank_ids_per_sample.append(line_rank) + last_b = b + line_rank_ids.append(line_rank_ids_per_sample) + + return torch.tensor(line_rank_ids, device=segment_level_bboxes.device) + + +def _generate_segment_level_inner_ranks(line_rank_id: torch.Tensor): + # line_inner_rank_id is the inner rank as follows 1 means start 2 for all middle tokens 3 for end token ... for each token in the line/segment. + # if there is no middle token, start token will be 1 and end token will be 3. 
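+    # Worked example (illustrative, single sample): line_rank_id = [0, 1, 1, 1, 2, 2, 0]
+    # yields spans (0,0), (1,3), (4,5), (6,6) and inner ranks [1, 1, 2, 3, 1, 3, 1]:
+    # single-token segments get 1, two-token segments 1/3, longer segments 1, 2, ..., 3.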
+ inner_ranks = [] + for line_ranks_per_sample in line_rank_id: + inner_ranks_per_sample = torch.zeros_like( + line_ranks_per_sample, device=line_ranks_per_sample.device + ) + + line_segment_spans = [] + start_idx = 0 + last_lr = None + for curr_idx, lr in enumerate(line_ranks_per_sample): + if last_lr is not None and lr != last_lr: + line_segment_spans.append((start_idx, curr_idx - 1)) + start_idx = curr_idx + last_lr = lr + line_segment_spans.append( + (start_idx, start_idx) + ) # add the last segment for sep token + + for span in line_segment_spans: + span_start, span_end = span + span_length = span_end - span_start + if span_length == 0: + inner_ranks_per_sample[span_start] = 1 # only one token in the line + elif span_length == 1: + inner_ranks_per_sample[span_start] = 1 # start + inner_ranks_per_sample[span_end] = 3 # end + else: + inner_ranks_per_sample[span_start] = 1 # start + inner_ranks_per_sample[span_start + 1 : span_end] = 2 + inner_ranks_per_sample[span_end] = 3 # end + inner_ranks.append(inner_ranks_per_sample) + return torch.stack(inner_ranks) + + +def _generate_first_token_idxes(line_rank_id: torch.Tensor, max_segment_num: int = 150): + first_token_idxes = [] + first_token_idxes_mask = [] + for line_ranks_per_sample in line_rank_id: + first_token_idxes_per_sample = [] + first_token_idxes_mask_per_sample = [] + last_lr = None + for curr_idx, lr in enumerate(line_ranks_per_sample): + if last_lr is not None and lr != last_lr and lr != 0: + first_token_idxes_per_sample.append(curr_idx) + last_lr = lr + + # make mask + if len(first_token_idxes_per_sample) > max_segment_num: + first_token_idxes_per_sample = first_token_idxes_per_sample[ + :max_segment_num + ] + + first_token_idxes_mask_per_sample = [1] * len(first_token_idxes_per_sample) + [ + 0 + ] * (max_segment_num - len(first_token_idxes_per_sample)) + first_token_idxes_per_sample = first_token_idxes_per_sample + [0] * ( + max_segment_num - len(first_token_idxes_per_sample) + ) + first_token_idxes_mask.append(first_token_idxes_mask_per_sample) + first_token_idxes.append(first_token_idxes_per_sample) + + first_token_idxes = torch.tensor(first_token_idxes, device=line_rank_id.device) + first_token_idxes_mask = torch.tensor( + first_token_idxes_mask, device=line_rank_id.device, dtype=torch.float32 + ) + return first_token_idxes, first_token_idxes_mask diff --git a/docgenie/data/_transforms/mmdet.py b/docgenie/data/_transforms/mmdet.py new file mode 100755 index 0000000000000000000000000000000000000000..5600dafe4931771330f44c66cbe9083656931754 --- /dev/null +++ b/docgenie/data/_transforms/mmdet.py @@ -0,0 +1,345 @@ +from __future__ import annotations + +from collections.abc import Sequence + +import numpy as np +from mmcv.transforms.base import BaseTransform +from mmcv.transforms.utils import cache_randomness +from mmdet.registry import TRANSFORMS +from PIL.Image import Image as PILImage +from pydantic import Field + +from docgenie.data._core._data_types import ( + DocumentInstance, + LayoutAnalysisAnnotation, + MMDetInput, +) +from docgenie.data._transforms._generics._base import ( + BaseTransform as DocGenieBaseTransform, +) +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +@TRANSFORMS.register_module() +class RandomChoiceResize(BaseTransform): + def __init__(self, scales: Sequence[int | tuple], **resize_kwargs) -> None: + super().__init__() + + import mmengine + from mmdet.datasets.transforms import Resize + + if isinstance(scales, list): + self.scales = scales + else: + self.scales = [scales] + 
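+        # Each transform() call picks one entry of self.scales uniformly at random
+        # (multi-scale training); the single Resize instance below is reused with its
+        # `scale` attribute swapped in per call.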
assert mmengine.is_seq_of(self.scales, (tuple, int)) + self.resize = Resize(scale=0, backend="pillow", **resize_kwargs) + + @cache_randomness + def _random_select(self) -> tuple[int, int]: + """Randomly select an scale from given candidates. + + Returns: + (tuple, int): Returns a tuple ``(scale, scale_dix)``, + where ``scale`` is the selected image scale and + ``scale_idx`` is the selected index in the given candidates. + """ + + scale_idx = np.random.randint(len(self.scales)) + scale = self.scales[scale_idx] + return scale, scale_idx + + def transform(self, results: dict) -> dict: + """Apply resize transforms on results from a list of scales. + + Args: + results (dict): Result dict contains the data to transform. + + Returns: + dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', + 'gt_keypoints', 'scale', 'scale_factor', 'img_shape', + and 'keep_ratio' keys are updated in result dict. + """ + + target_scale, scale_idx = self._random_select() + self.resize.scale = target_scale + results = self.resize(results) + results["scale_idx"] = scale_idx + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f"(scales={self.scales}" + repr_str += f", resize={self.resize})" + return repr_str + + +class DocumentInstanceMMDetTransform(DocGenieBaseTransform[MMDetInput]): + train_scale: list[tuple[int, int]] | tuple[int, int] = Field( + default=[ + (480, 1333), + (512, 1333), + (800, 1333), + ], + description="Scale for training images.", + ) + test_scale: list[tuple[int, int]] | tuple[int, int] = Field( + default=(1333, 800), description="Scale for testing images." + ) + is_training: bool = Field( + default=False, description="Whether the transform is used for training." + ) + use_test_time_augmentation: bool = Field( + default=False, description="Whether to use test time augmentation." + ) + use_flip: bool = Field( + default=False, description="Whether to use flip augmentation during testing." + ) + use_fixed_size: bool = Field( + default=False, description="Whether to use fixed size resizing." + ) + fixed_size: int = Field( + default=800, description="Fixed size to resize the shorter side to." 
+ ) + + def get_output_data_model(self) -> type[MMDetInput]: + return MMDetInput + + def model_post_init(self, context) -> None: + import torchvision.transforms as T + + # self._transform = T.Compose([]) + # return + from mmdet.datasets.transforms import ( + LoadAnnotations, + PackDetInputs, + RandomFlip, + Resize, + ) + + if self.is_training: + # from mmcv.transforms import RandomChoiceResize, TestTimeAug + from mmcv.transforms import TestTimeAug + + train_scale = self.train_scale + if isinstance(self.train_scale, tuple): + train_scale = [self.train_scale] + + self._transform = T.Compose( + [ + LoadAnnotations(with_bbox=True, with_mask=False, box_type=None), + Resize(scale=self.fixed_size, keep_ratio=False) + if self.use_fixed_size + else RandomChoiceResize( + scales=train_scale, keep_ratio=self.use_fixed_size is False + ), + *([RandomFlip(prob=0.5)] if self.use_flip else []), + PackDetInputs( + meta_keys=( + "id", + "img_id", + "img_path", + "ori_shape", + "img_shape", + "scale_factor", + "flip", + "flip_direction", + ) + ), + ] + ) + else: + from mmcv.transforms import TestTimeAug + + if self.use_test_time_augmentation: + if isinstance(self.test_scale, tuple): + test_scale = [self.test_scale] + self._transform = T.Compose( + [ + LoadAnnotations(with_bbox=True, with_mask=False, box_type=None), + TestTimeAug( + transforms=[ + [ + RandomChoiceResize( + scales=test_scale, keep_ratio=True + ) + ], + [RandomFlip(prob=0.0), RandomFlip(prob=1.0)], + [ + PackDetInputs( + meta_keys=( + "__key__", + "__index__", + "img_id", + "img_path", + "ori_shape", + "img_shape", + "scale_factor", + "flip", + "flip_direction", + ) + ) + ], + ] + ), + ] + ) + else: + import torchvision.transforms as T + from mmcv.transforms import TestTimeAug + + if isinstance(self.test_scale, list): + test_scale = self.test_scale[0] + else: + test_scale = self.test_scale + + self._transform = T.Compose( + [ + LoadAnnotations(with_bbox=True, with_mask=False, box_type=None), + Resize( + scale=self.fixed_size, + keep_ratio=self.use_fixed_size is False, + ), + PackDetInputs( + meta_keys=( + "__key__", + "__index__", + "img_id", + "img_path", + "ori_shape", + "img_shape", + "scale_factor", + "flip", + "flip_direction", + ) + ), + ] + ) + + def _extract_annotated_objects(self, document_instance: DocumentInstance): + assert document_instance.annotations is not None, ( + f"Document instance must have annotations for {self.__class__} ." + ) + layout_annotations = None + for annotation in document_instance.annotations: + if isinstance(annotation, LayoutAnalysisAnnotation): + layout_annotations = annotation.annotated_objects + break + assert layout_annotations is not None, ( + f"Document instance must have layout annotations for {self.__class__}." + ) + return layout_annotations + + def _get_image(self, document_instance: DocumentInstance) -> PILImage: + assert document_instance.image is not None, ( + "DocumentInstance image must be loaded before applying transforms." + ) + assert isinstance(document_instance.image.content, PILImage), ( + "DocumentInstance image content must be a PIL Image." 
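+        # (the mmdet pipeline below converts this PIL image to a numpy array and reads
+        #  width/height from it, so the image must already be decoded at this point)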
+ ) + return document_instance.image.content + + def _is_valid_bbox( + self, bbox: list[float], image_width: int, image_height: int + ) -> bool: + x1, y1, x2, y2 = bbox + if 0 <= x1 < x2 <= image_width and 0 <= y1 < y2 <= image_height: + return True + if (x2 - x1) > 1 and (y2 - y1) > 1: + return True + return False + + def _unnormalize_bbox( + self, bbox: list[float], image_width: int, image_height: int + ) -> list[float]: + x1, y1, x2, y2 = bbox + return [ + x1 * image_width, + y1 * image_height, + x2 * image_width, + y2 * image_height, + ] + + def _clip_bbox( + self, bbox: list[float], image_width: int, image_height: int + ) -> list[float]: + x1, y1, x2, y2 = bbox + x1 = min(max(x1, 0), image_width - 1) + x2 = min(max(x2, 0), image_width - 1) + y1 = min(max(y1, 0), image_height - 1) + y2 = min(max(y2, 0), image_height - 1) + return [x1, y1, x2, y2] + + def _prepare_instances( + self, document_instance: DocumentInstance, image_width: int, image_height: int + ) -> list[dict]: + annotated_objects = self._extract_annotated_objects(document_instance) + + instances = [] + is_bbox_normalized = annotated_objects.bbox.normalized + for bbox, label, iscrowd in zip( + annotated_objects.bbox.value, + annotated_objects.label.value, + annotated_objects.iscrowd, + strict=True, + ): + if is_bbox_normalized: + bbox = self._unnormalize_bbox(bbox, image_width, image_height) + + # first clip the bbox to be within image bounds, then check validity + bbox = self._clip_bbox(bbox, image_width, image_height) + + if not self._is_valid_bbox(bbox, image_width, image_height): + logger.warning( + f"Invalid bbox {bbox} for image of size ({image_width}, {image_height}) in document instance {document_instance.sample_id}. Skipping this bbox." + ) + continue + + instance = { + "bbox": [float(coord) for coord in bbox], + "bbox_label": label, + "ignore_flag": 1 if iscrowd else 0, + } + instances.append(instance) + + return instances + + def _apply_transforms(self, document_instance: DocumentInstance) -> MMDetInput: + image = self._get_image(document_instance) + output = self._transform( + { + "id": document_instance.sample_id, + "img_id": document_instance.index, + "instances": self._prepare_instances( + document_instance, + image_width=image.width, + image_height=image.height, + ), + "img": np.array(image), + "img_shape": ( + image.height, + image.width, + ), + "ori_shape": ( + image.height, + image.width, + ), + } + ) + output = MMDetInput(**output) + + return output + + def __call__(self, document_instance: DocumentInstance): + return self._apply_transforms(document_instance) + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(\n" + f" is_training={self.is_training},\n" + f" use_test_time_augmentation={self.use_test_time_augmentation},\n" + f" transform={self._transform},\n" + f")" + ) diff --git a/docgenie/data/_transforms/utilities.py b/docgenie/data/_transforms/utilities.py new file mode 100755 index 0000000000000000000000000000000000000000..4f4d8ab740687ac000dc82c904b93eb24e6c2a40 --- /dev/null +++ b/docgenie/data/_transforms/utilities.py @@ -0,0 +1,8 @@ +import hashlib +import json + +def generate_transform_hash(kwargs_dict): + """Generate a unique hash for transform kwargs.""" + # Sort the dictionary to ensure consistent hashing + sorted_kwargs = json.dumps(kwargs_dict, sort_keys=True, default=str) + return hashlib.md5(sorted_kwargs.encode()).hexdigest()[:8] diff --git a/docgenie/data/_transforms/vlms/tranforms.py b/docgenie/data/_transforms/vlms/tranforms.py new file mode 100755 index 
0000000000000000000000000000000000000000..dc2f05cded6a9beb2bcdfed56d86a37a7e8ca0f3 --- /dev/null +++ b/docgenie/data/_transforms/vlms/tranforms.py @@ -0,0 +1,142 @@ +from abc import ABC, abstractmethod + +from atria_core.utilities.repr import RepresentationMixin +from pydantic import BaseModel + +from docgenie.data._core._data_types import ( + ConditionalGenerationModelInput, + DatasetLabels, + DocumentInstance, +) +from docgenie.data._transforms._tokenizers._utilities import _extract_annotations +from docgenie.logging import get_logger + +logger = get_logger(__name__) + + +class BaseVLMTokenizer(RepresentationMixin, BaseModel, ABC): + """Base class for VLM tokenizers""" + + tokenizer_name: str = "deepseek-community/deepseek-vl-1.3b-base" + tokenizer_cache_dir: str = "./cache" + is_training: bool = True + dataset_labels: DatasetLabels + + def model_post_init(self, context) -> None: + self._default_init_kwargs = { + "cache_dir": self.tokenizer_cache_dir, + "local_files_only": False, + "apply_ocr": False, + } + self._default_call_kwargs = { + "add_special_tokens": True, + "padding": "max_length", + "truncation": True, + "max_length": 1024, + "stride": 0, + "pad_to_multiple_of": 8, + "return_tensors": "pt", + } + + self._setup_processor() + self._tokenizer = ( + self._processor.tokenizer + if hasattr(self._processor, "tokenizer") + else self._processor + ) + + def _setup_processor(self): + """Setup processor - can be overridden by child classes""" + from transformers import AutoProcessor + + self._processor = AutoProcessor.from_pretrained( + self.tokenizer_name, **self._default_init_kwargs + ) + + def _get_common_kwargs(self, document_instance: DocumentInstance) -> tuple: + """Extract common data from document instance""" + image = document_instance.image.load().content.convert("RGB") + words = ( + document_instance.content.words + if document_instance.content is not None + else [] + ) + boxes = ( + document_instance.content.word_bboxes.value + if document_instance.content is not None + else [] + ) + + if not words: + words = ["None"] + boxes = [[0, 0, 0, 0]] + + return image, words, boxes + + def _tokenize_target(self, target_text: str, max_length: int = 128): + """Common target tokenization logic""" + + target_token_ids = self._tokenizer.encode( + target_text, + add_special_tokens=True, + return_tensors="pt", + max_length=max_length, + truncation=True, + padding="max_length", + )[0] + + # Set padding token IDs to -100 to ignore in loss computation + target_token_ids[target_token_ids == 0] = -100 + return target_token_ids + + @abstractmethod + def _prepare_instances( + self, document_instance: DocumentInstance, annotations + ) -> ConditionalGenerationModelInput: + """Prepare instances for the specific task type""" + pass + + def __call__(self, document_instance: DocumentInstance): + annotations = _extract_annotations(document_instance) + return self._prepare_instances(document_instance, annotations) + + +class SequenceClassificationVLMTokenizer(BaseVLMTokenizer): + """Tokenizer for sequence classification tasks""" + + def _prepare_instances( + self, document_instance: DocumentInstance, annotations + ) -> ConditionalGenerationModelInput: + import torch + + possible_labels = self.dataset_labels.classification or [] + image, words, boxes = self._get_common_kwargs(document_instance) + + prompt = f"Document Classification. Classify the document into one of these categories: {', '.join(possible_labels)}. 
Document: " + target_text = annotations.label.name + + # Tokenize input + if self.tokenizer_name == "microsoft/udop-large": + tokenized_instance = self._processor( + image, prompt, text_pair=words, boxes=boxes, **self._default_call_kwargs + ) + elif self.tokenizer_name in ["google-t5/t5-large", "google-t5/t5-base"]: + tokenized_instance = self._processor( + prompt, text_pair=" ".join(words), **self._default_call_kwargs + ) + + for key, value in tokenized_instance.items(): + tokenized_instance[key] = value.squeeze(0) + + target_token_ids = self._tokenize_target(target_text, max_length=16) + + return ConditionalGenerationModelInput( + **tokenized_instance, + index=torch.tensor(document_instance.index), + sample_id=document_instance.sample_id, + words=words, + target_text=target_text, + target_token_ids=target_token_ids, + _tokenizer_name=self.tokenizer_name, + _tokenizer_init_kwargs=self._default_init_kwargs, + ) diff --git a/docgenie/data/_transforms/vlms/utilities.py b/docgenie/data/_transforms/vlms/utilities.py new file mode 100755 index 0000000000000000000000000000000000000000..d755ee4f4d2d680cf0e8d7ecbb69eb53985849a1 --- /dev/null +++ b/docgenie/data/_transforms/vlms/utilities.py @@ -0,0 +1,63 @@ + + +import json +from docgenie.data._core._utilities import TaskType + + +def _prepare_system_messages(task_type: TaskType, labels: list[str]) -> str: + if task_type == TaskType.sequence_classification: + return f"You are a document classification model. Classify the document into one of the given categories: {json.dumps(labels)}." + + elif task_type == TaskType.token_classification: + return f"You are an information extraction model. Extract all the entities present in this document. Choose from the given entity categories: {json.dumps(labels)}." + + elif task_type == TaskType.extractive_qa: + return "You are a question answering model. Answer the question based on the content of the document." + + elif task_type == TaskType.layout_analysis: + return f""" + You are a layout analysis model. Extract the layout entities present in the document. + Choose from the given layout categories: {json.dumps(labels)}. + Provide the output in the format