diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..5b1c3ac0de8096376f74ef34bf08eaa86f651e3a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,59 @@ +# Ignore development artifacts +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +*.dylib +*.log +.venv/ +venv/ +ENV/ +env/ +.git/ +.gitignore +.gitlab-ci.yml +*.md +!README.md +.pytest_cache/ +*.swp +*.swo +*~ +.DS_Store + +# Ignore data directories (too large for Docker context) +data/ +!data/prompt_templates/ +!data/visual_element_prefabs/ + +# Ignore build artifacts +*.egg-info/ +dist/ +build/ +*.whl + +# Ignore handwriting service (separate deployment) +handwriting_service/ + +# Ignore WordStylist (not needed for API) +WordStylist/ + +# Ignore scripts (not needed for API runtime) +scripts/ + +# Ignore documentation and deployment files +ARCHITECTURE.md +DEPLOYMENT.md +*.sh +!start.sh +!start_worker.sh +docker-compose.yml +railway.json +railway_setup_vars.sh + +# Keep only essential code +!docgenie/ +!api/ +!setup.py +!pyproject.toml diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..e7aea87a0092a7d8b52ce45d2aa2759f26cac194 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.svg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +*.ico filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.jpg filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000000000000000000000000000000000000..92b93970145852b19d0dab127119aab54824a7ad --- /dev/null +++ b/.gitignore @@ -0,0 +1,172 @@ +# Project +data/clusters/ +data/embeddings/ +data/temp/ +wandb/ +data/models/ +data/webapp_cache/ +data/analyzation/ +data/cherrypicks/ +data/hw_imgs/ +/data/seed-images/* +/docgenie/playground/test.py 
+/docgenie/playground/handwritten_text/doc_vqa_handwriting_text_images +/docgenie/playground/handwritten_text/handwriting_raw_tokens +/docgenie/playground/handwritten_text/temp +data/datasets +data/models +data/cluster_plots +data/syn_dataset_statistics_plots +data/gt_embeddings +data/wandb_downloads +data/wandb_project_csvs +data/folders.txt +cache +runs +visualizations +.venv +**/**.__pycache__ +/docgenie/playground/handwritten_text/doc_vqa_handwriting_text_images +/docgenie/playground/handwritten_text/temp +data/datasets +data/models + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.log + +# Virtual environments +venv/ +env/ +ENV/ +.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Jupyter Notebook +.ipynb_checkpoints +*.ipynb_checkpoints/ + +# Model artifacts - download separately +inference/ +inference_new/ +inference_hf/ +model/experiments/hf_conditional_latent/cached_vae/ +*.zip + + +# Datasets - download separately +docvqa-handwritten-sizes4/ +syn_docvqa/ +iam_dataset/ +iam_dataset_processed/ +iam_dataset_processed_partial/ +docvqa-test/ +docvqa-viselems/ +docvqa-viselems2/ +temp/ +generations/ + +# Generated outputs +output/ + +# Backup files +*.bak +*.backup +*.tmp + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# OS +./data/clusters_old/ +Thumbs.db + + +# Training +training/ +vae_evaluation/ + + +# Logs and checkpoints +*.pt +# But allow the inference model for handwriting service +!handwriting_service/WordStylist/models/ema_ckpt.pt +*.ckpt +*.pth +*.safetensors + +.env + +# Playwright +node_modules/ +/test-results/ +/playwright-report/ +/blob-report/ +/playwright/.cache/ +/playwright/.auth/ + + +!data/models/ +!data/models/handwriting/ +!data/models/handwriting/char_vocab.json +!data/models/handwriting/config.yaml +!data/models/handwriting/writer_id_map.json 
+!data/models/handwriting/cached_vae/config.json +data/models/.locks* +data/models/baseline +data/models/legacy +data/models/models* +data/models/pretrained +test_run.py +test_vlm.ipynb +test.ipynb +test2.ipynb +test3.py +test4.py +test5.py +test6.py +data/results +data/results_old/ +data/tmp/ +docgenie/playground/extract_02_eval_metrics_from_wandb.py +docgenie/playground/extract_metrics_from_wandb.py +data/cached_subsets +data/mixed_datasets +data/results_backup_v1 +data/results_v1 +data/old-results/ +data/embeddings +data/mixed_datasets +data/results_backup_v1 +sync_datasets.sh +data/results_latest +data/results_latest copy diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100755 index 0000000000000000000000000000000000000000..6cd5d7481bede501a691eac5043403cd029d7eec --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,16 @@ +# You can override the included template(s) by including variable overrides +# SAST customization: https://docs.gitlab.com/ee/user/application_security/sast/#customizing-the-sast-settings +# Secret Detection customization: https://docs.gitlab.com/user/application_security/secret_detection/pipeline/configure +# Dependency Scanning customization: https://docs.gitlab.com/ee/user/application_security/dependency_scanning/#customizing-the-dependency-scanning-settings +# Container Scanning customization: https://docs.gitlab.com/ee/user/application_security/container_scanning/#customizing-the-container-scanning-settings +# Note that environment variables can be set in several places +# See https://docs.gitlab.com/ee/ci/variables/#cicd-variable-precedence +stages: +- test +- secret-detection +variables: + SECRET_DETECTION_ENABLED: 'true' +secret_detection: + stage: secret-detection +include: +- template: Security/Secret-Detection.gitlab-ci.yml diff --git a/.python-version b/.python-version new file mode 100755 index 0000000000000000000000000000000000000000..efbce23a0e1b1eed58654641085f009d5233a0fb --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ 
+3.11.12 diff --git a/API_FLOW_DOCUMENTATION.md b/API_FLOW_DOCUMENTATION.md new file mode 100644 index 0000000000000000000000000000000000000000..b8a7767b0b10d8485dfad38c82284ade137eb4d3 --- /dev/null +++ b/API_FLOW_DOCUMENTATION.md @@ -0,0 +1,1024 @@ +# Complete API Flow Documentation + +## Overview +The DocGenie API provides three endpoints for synthetic document generation, implementing a 19-stage pipeline that transforms seed images and prompts into complete datasets with OCR, ground truth, and optional handwriting/visual elements. + +**Base URL**: `http://localhost:8000` (development) or Railway deployment +**Documentation**: `/docs` (FastAPI auto-generated Swagger UI) + +--- + +## API Endpoints + +### 1. `/generate` - Legacy JSON Response (POST) +**Purpose**: Generate documents and return complete JSON metadata +**Response**: JSON with HTML, PDF (base64), bounding boxes, optional handwriting/visual elements +**Use Case**: Testing, development, full metadata inspection +**Pipeline Stages**: 1-19 (configurable via parameters) + +### 2. `/generate/pdf` - Sync PDF+Dataset ZIP (POST) +**Purpose**: Generate documents and return ZIP file with all artifacts +**Response**: ZIP file containing: +- `*.pdf` - Generated document PDFs +- `*_final.pdf` - PDFs with handwriting/visual elements (if enabled) +- `*.msgpack` - Dataset format (if export enabled) +- `metadata.json` - Complete generation metadata +- `handwriting/` - Individual handwriting images +- `visual_elements/` - Individual visual element images + +**Use Case**: Production dataset generation, batch processing +**Pipeline Stages**: 1-19 (all features available) + +### 3. 
`/generate/async` - Async Batch Processing (POST) +**Purpose**: Queue large batch jobs via background worker (Redis Queue) +**Response**: Task ID for status polling +**Status Check**: `GET /generate/async/status/{task_id}` +**Result Download**: `GET /generate/async/result/{task_id}` (returns ZIP) +**Use Case**: Large-scale dataset generation (100+ documents) +**Pipeline Stages**: 1-19 (via worker.py) + +--- + +## Request Parameters + +```python +class GenerateDocumentRequest: + seed_images: List[HttpUrl] # 1-8 seed images from web URLs + prompt_params: PromptParameters # Generation configuration + +class PromptParameters: + # Core Parameters + language: str = "english" # Document language + doc_type: str = "invoice" # Document type (invoice, receipt, form, etc.) + gt_type: str = "qa" # Ground truth format (qa, kie) + gt_format: str = "json" # GT encoding (json, annotation) + num_solutions: int = 1 # Documents per seed set + + # Feature Toggles (Stages 07-19) + enable_handwriting: bool = False # Stage 07-09, 12 + handwriting_ratio: float = 0.2 # Probabilistic filter (0.0-1.0) + enable_visual_elements: bool = False # Stage 08, 10, 13 + visual_element_types: List[str] = [] # Filter types: logo, photo, figure, barcode, etc. + enable_ocr: bool = True # Stage 15 + enable_bbox_normalization: bool = True # Stage 16 + enable_gt_verification: bool = False # Stage 17 + enable_analysis: bool = False # Stage 18 + enable_debug_visualization: bool = False # Stage 19 + enable_dataset_export: bool = False # Stage 19 (msgpack format) + dataset_export_format: str = "msgpack" # Currently only msgpack supported + + # Reproducibility + seed: Optional[int] = None # Random seed (null = random, int = reproducible) +``` + +--- + +## Pipeline Architecture: The 19 Stages + +The API implements all 19 stages of the original batch pipeline in `docgenie/generation/`. Each stage is mapped to corresponding functions in `api/utils.py`. 
+ +### **Phase 1: Core Pipeline (Stages 01-06)** +Generate base documents from seed images and LLM prompts. + +#### **Stage 01: Seed Selection & Download** +- **Original**: `pipeline_01_select_seeds.py` +- **API**: `download_seed_images()` in `api/utils.py:117-161` +- **Process**: + 1. Accept user-provided seed image URLs (1-8 images) + 2. Download with retry logic (3 attempts, exponential backoff) + 3. Handle transient HTTP errors (502, 503, 504, 429) + 4. Convert to base64 for LLM input +- **Error Handling**: Retry with 2s, 4s, 8s delays; raise HTTPException on failure + +#### **Stage 02: Prompt LLM** +- **Original**: `pipeline_02_prompt_llm.py` +- **API**: `call_claude_api_direct()` in `api/utils.py:550-600` +- **Process**: + 1. Load prompt template: `data/prompt_templates/ClaudeRefined12/seed-based-json.txt` + 2. Build prompt with parameters: language, doc_type, gt_type, num_solutions + 3. Call Claude API (Anthropic Messages API v1) + - Model: `claude-3-5-sonnet-20241022` (configurable) + - Max tokens: 16,000 + - Temperature: 1.0 + - Vision: Send base64-encoded seed images + 4. Receive HTML documents with embedded ground truth +- **LLM Output Format**: Multiple `...` blocks with: + - CSS styling with page dimensions + - HTML elements with semantic classes + - Handwriting markers: `class="handwritten author1"` (author1, author2, etc.) + - Visual element placeholders: `data-placeholder="logo"`, `data-content="company-logo"` + - Ground truth: `` + +#### **Stage 03: Process Response & Extract HTML** +- **Original**: `pipeline_03_process_response.py` +- **API**: `extract_html_documents_from_response()` in `api/utils.py:605-635` +- **Process**: + 1. Parse LLM response for `...` blocks (regex) + 2. Prettify HTML with BeautifulSoup + 3. Validate HTML structure + 4. Extract ground truth JSON from ` in the following format: {gt example} +Notes: +• Pay close attention to cultural/regional differences seen +in the seed images (e.g., language, format, disclaimers). 
+• Feel free to creatively adapt or combine stylistic cues +from the seeds, as long as the end result looks authentic +for that cultural context. +• Do NOT directly copy-paste text or entire code blocks +from any single seed image or across these new solutions. +Now please generate the {num solutions} distinct +{doc type} documents. diff --git a/data/prompt_templates/Adaptation_GT/seed-free.txt b/data/prompt_templates/Adaptation_GT/seed-free.txt new file mode 100755 index 0000000000000000000000000000000000000000..634698f7e5ffd52c3e87fbf2a7844c3079998758 --- /dev/null +++ b/data/prompt_templates/Adaptation_GT/seed-free.txt @@ -0,0 +1,25 @@ +You are an AI specialized in generating multiple unique +HTML documents in one response. Please create +{num solutions} unique HTML documents representing +{doc type}. +Each solution must: +1. Include all mandatory fields: {sections}. +2. Be formatted so it could print on A4 (e.g., use @page +{{ size: A4; }} in your CSS). +3. Show a significantly different layout, styling, and textual content from every other solution. +4. Maintain a {background requirements}. +5. Avoid copy-pasting or reusing large chunks of HTML, +CSS, or disclaimers—each document must be at least +70% different in code and text than the others. +6. Wrap each complete document between +and tags, labeled as: +1. ...Solution #1... +2. ...Solution #2... +... +{num solutions}. ...Solution +#{num solutions}... +Include the {gt type} as JSON in the document via in the following format: {gt example} +Do not provide additional commentary or references to the +other solutions within each HTML. +Now generate the {num solutions} distinct {doc type} +documents. 
diff --git a/data/prompt_templates/ClaudeRefined1/seed-based.txt b/data/prompt_templates/ClaudeRefined1/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..83b816d620eed5f01c6328df4c1f5d02f3f4bd13 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined1/seed-based.txt @@ -0,0 +1,78 @@ +# HTML Document Generation Prompt (Refined) + +You are an AI specialized in creating culturally authentic HTML documents based on visual analysis of real-world examples. You have been provided with {num_seed_images} seed images of **{doc_type}** documents from different cultural and regional contexts. + +## Cultural Variations (If Present) +The seed images may demonstrate regional differences such as: +- Language variations and local terminology +- Date formatting conventions (DD/MM/YYYY, MM/DD/YYYY, etc.) +- Currency symbols and number formatting +- Layout preferences (field positioning, official elements, cultural design patterns) +- Regional legal disclaimers and regulatory requirements +- Typography and visual hierarchy standards + +## Task Requirements +Generate **{num_solutions}** unique HTML documents that meet these specifications: + +### Core Requirements +1. **Cultural Authenticity**: If cultural/regional variations are present in the seed images, reflect those stylistic elements without directly copying any text, disclaimers, or layouts verbatim +2. **Required Content**: Include all essential fields: {required_sections} +3. **Single Page Format**: Design as single-page documents with dimensions appropriate to the document type (receipts: narrow format, forms: standard width, etc.) +4. **Language**: Generate all content in {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: Each document must be at least 70% different in code structure, styling, and content from others + +## Ground Truth Generation +Generate appropriate ground truth data for each document: {gt_type}. 
+Include the ground truth as JSON inside each document in a `` tag. +The ground truth must follow the format: {gt_format} + +### Technical Specifications +- Wrap each solution in `...` tags numbered sequentially +- Include the ground truth JSON in `` as specified above +- Implement static CSS appropriate for the document type and single-page layout (no animations, transitions, or dynamic effects) + +## Additional Requirements +{user_descriptions} + +### Content Guidelines +- **DO**: Adapt any cultural/regional stylistic elements present in the seed images +- **DO**: Create authentic-feeling content appropriate to each cultural context +- **DO**: Vary layout structures, color schemes, and typographic choices +- **DO**: Use static styling only (no animations, hover effects, or transitions) +- **DON'T**: Copy-paste text, code blocks, or entire sections between solutions +- **DON'T**: Reuse identical disclaimers, headers, or formatting patterns +- **DON'T**: Include any dynamic effects, animations, or interactive elements + +## Additional Requirements +{user_descriptions} + +## Output Format +Structure your response as: + +``` +1. + + ...complete HTML document... + + +2. + + ...complete HTML document... + + +...continue for all {num_solutions} solutions +``` + +## Quality Checklist +Before generating, ensure each document: +- [ ] Reflects any authentic cultural/regional characteristics present in seed images +- [ ] Contains all required sections: {required_sections} +- [ ] Uses static styling only (no animations or dynamic effects) +- [ ] Uses appropriate single-page formatting for the document type +- [ ] All content is in {language} +- [ ] Includes the specified ground truth in proper JSON format +- [ ] Maintains 70%+ uniqueness from other solutions +- [ ] Follows semantic HTML best practices + +Now generate the **{num_solutions}** distinct **{doc_type}** documents. 
\ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined10/seed-based.txt b/data/prompt_templates/ClaudeRefined10/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..21fb12f4689eae2f5eb109d684594b5b67a21c42 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined10/seed-based.txt @@ -0,0 +1,57 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: signature, stamp, logo, barcode, photo, chart +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people and ensure the author is semantically coherent with the document content +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Dimensions in mm/cm, e.g. `width:30mm;height:20mm;` +- Example: `
` +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS/JS +- [ ] Content in {language} +- [ ] GT JSON present and correctly formatted +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined11/seed-based.txt b/data/prompt_templates/ClaudeRefined11/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..ea526e5508632fca75840f26cc364944daa15015 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined11/seed-based.txt @@ -0,0 +1,55 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. 
+ +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and use regular text +- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- If the handwriting represents a signature mark it additionally with class 'signature' + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: stamp, logo, barcode, photo, chart +- Add data-content attribute with actual content description +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Always provide dimensions in mm/cm, e.g. `width:30mm;height:20mm;` +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS/JS +- [ ] Content in {language} +- [ ] GT JSON present and correctly formatted +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt b/data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt new file mode 100755 index 0000000000000000000000000000000000000000..166162e89a5670ee549ee9a0eb66df9642bdaa2d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined12/seed-based-annotation.txt @@ -0,0 +1,55 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. 
+ +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and use regular text +- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- If the handwriting represents a signature mark it additionally with class 'signature' + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: stamp, logo, figure, barcode, photo +- Add data-content attribute with actual content description +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Always provide appropriate dimensions +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth by assigning each applicable element in HTML a class from the list below to uniquely identify its label: +{gt_type} +{gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS +- [ ] Content in {language} +- [ ] GT labels via class annotations are present and assigned to correct elements +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined12/seed-based-json.txt b/data/prompt_templates/ClaudeRefined12/seed-based-json.txt new file mode 100755 index 0000000000000000000000000000000000000000..6dbac5efd21eb7a8365ac553b11817d6defbb395 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined12/seed-based-json.txt @@ -0,0 +1,55 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. 
+ +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and use regular text +- Apply no special styles to 'handwritten', except generously increased size, in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- If the handwriting represents a signature mark it additionally with class 'signature' + +## Visual Placeholders (if document type requires) +- Insert `
` for non-text elements at appropriate positions +- Valid types are: stamp, logo, figure, barcode, photo +- Add data-content attribute with actual content description +- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right' +- Always provide appropriate dimensions +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS +- [ ] Content in {language} +- [ ] GT JSON present, correctly formatted and semantically coherent +- [ ] Visual elements are semantically coherent + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined2/seed-based.txt b/data/prompt_templates/ClaudeRefined2/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..107eeb8425d485b835ec96c4f36573cdb54d1d70 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined2/seed-based.txt @@ -0,0 +1,70 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. 
+Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' (no special styling/fonts, treat as regular text) +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements using HTML class 'visual-element' +- Add data attributes: data-type (signature/logo/stamp/barcode/photo/chart/etc.) and data-content (actual content) +- Give each placeholder appropriate dimensions via inline styles +- Examples: `
`, `
`, `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined3/seed-based.txt b/data/prompt_templates/ClaudeRefined3/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..1ef1aeafbc5d5865dc065b64c5ff0520bcfc5a03 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined3/seed-based.txt @@ -0,0 +1,70 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. 
+Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' (no special styling/fonts, treat as regular text) +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements as JSON in `` tag. +- Describe type (signature/logo/stamp/barcode/photo/chart/etc.) and content (actual content) +- Describe placement of each visual element with appropriate dimensions and y-rotation +- Examples: `[{"type": "signature", "content": "John Doe", "x0": 105, "x1": 116, "y0": 82, "y1": 102, "rotation": -4}, ...]` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. 
\ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined3CloneDoc/seed-based.txt b/data/prompt_templates/ClaudeRefined3CloneDoc/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..b968eed4d2f93b252244cc9afb1b18d4b6556a5d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined3CloneDoc/seed-based.txt @@ -0,0 +1,97 @@ +You are an AI creating HTML documents that **clone the style and structure** of {num_seed_images} seed images of **{doc_type}** documents. + +# Task: Generate {num_solutions} cloned HTML documents + +## Core Objective +**CLONE the visual design, layout, and structure** of the seed images while using **completely different data**. Think of this as creating blank template instances filled with new information. + +## Critical Requirements +1. **Visual Fidelity**: Replicate styling elements from seed images: + - Exact layout structure (positioning, spacing, alignment) + - Typography (fonts, sizes, weights, colors) + - Visual hierarchy and sectioning + - Color schemes and backgrounds + - Border styles, dividers, and decorative elements + - Logo/header/footer placement and styling + +2. **Data Uniqueness**: Generate completely new content: + - **NEVER copy**: names, addresses, phone numbers, emails, IBANs, account numbers, license numbers, ID numbers, dates, amounts, prices, or any other specific data points + - Generate realistic but fictional alternatives for all data fields + - Maintain data type appropriateness (valid formats for phones, IBANs, dates, etc.) + - Ensure cultural/regional authenticity for generated data + +3. **Required Fields**: {required_sections} + +4. **Format**: Single-page, dimensions matching seed documents + +5. **Language**: {language} + +6. **Background**: {background_requirements} + +7. 
**Static Only**: No animations, transitions, or dynamic effects + +## Cloning Strategy +- **DO**: Match layout grids, spacing, font choices, color palettes, sectioning patterns, table structures, visual element placement +- **DON'T**: Copy any actual text content, numerical data, personal information, or business-specific details +- **Think**: "Same template, different instance" + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Replicate CSS styling patterns from seed documents + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' (no special styling/fonts, treat as regular text) +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document +- Generate different handwritten content than seed documents + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements as JSON in `` tag. +- Describe type (signature/logo/stamp/barcode/photo/chart/etc.) 
and content (actual content - must be different from seed) +- Match placement patterns from seed documents with appropriate dimensions and y-rotation +- Examples: `[{"type": "signature", "content": "Jane Smith", "x0": 105, "x1": 116, "y0": 82, "y1": 102, "rotation": -4}, ...]` + +## Data Generation Guidelines +- Names: Generate culturally appropriate fictional names +- Addresses: Create realistic but non-existent addresses +- Phone/Fax: Use valid formats with fictional numbers +- IBANs/Account numbers: Generate format-compliant fictional numbers +- Dates: Use different dates maintaining logical consistency +- Amounts: Generate different values appropriate to context +- IDs/References: Create format-matching fictional identifiers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Layout/structure matches seed documents +- [ ] Typography and colors replicated +- [ ] ALL data is different from seed (no copied info) +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] Data formats are culturally appropriate + +Generate {num_solutions} cloned {doc_type} documents with new data. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined4/seed-based.txt b/data/prompt_templates/ClaudeRefined4/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..985e165a782f5848155ec49288544652368759cd --- /dev/null +++ b/data/prompt_templates/ClaudeRefined4/seed-based.txt @@ -0,0 +1,71 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. 
+ +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' and apply no styles to this class +- Distinguish between different sizes of handwriting using classes 'hw-size1', 'hw-size2' which are in line with realistic handwriting and dependent on the context +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document + +## Visual Placeholders (if document type requires) +- Include placeholders for non-text visual elements as JSON in `` tag. +- Describe type (signature/logo/stamp/barcode/photo/chart/etc.) 
and content (actual content) +- Describe placement of each visual element with appropriate dimensions and y-rotation +- Examples: `[{"type": "signature", "content": "John Doe", "x0": 105, "x1": 116, "y0": 82, "y1": 102, "rotation": -4}, ...]` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined5/seed-based.txt b/data/prompt_templates/ClaudeRefined5/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..0d87781ea0b394f199ee0b01c8d3a1705c29c166 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined5/seed-based.txt @@ -0,0 +1,77 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. 
**Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people's handwriting on the same document +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Use invisible placeholder divs with class 'visual-placeholder' +- Specify type via data-type attribute ('signature', 'stamp', 'logo', 'barcode', 'photo', 'chart', etc.) +- Add data-content attribute with actual content description +- For signatures/handwriting, add author class ('author1', 'author2', etc.) to distinguish different people +- Position naturally in document flow or use CSS positioning (absolute/relative) as appropriate +- Specify dimensions in mm/cm and rotation via inline style transform +- For overlapping elements (stamps over text), use CSS z-index and absolute positioning +- Example: `
` +- Example: `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined6/seed-based.txt b/data/prompt_templates/ClaudeRefined6/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..34abb8cfa0c6676642a2a3dae57dc4ed613dab8d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined6/seed-based.txt @@ -0,0 +1,77 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. 
+Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Use invisible placeholder divs with class 'visual-placeholder' +- Specify type via data-type attribute (signature, stamp, logo, barcode, photo, chart, etc.) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Position naturally in document flow or use CSS positioning (absolute/relative) as appropriate +- Specify dimensions in mm/cm and rotation via **inline** style transform +- For overlapping elements (stamps over text), use CSS z-index and absolute positioning +- Example: `
` +- Example: `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined7/seed-based.txt b/data/prompt_templates/ClaudeRefined7/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..9016855e8c2b98941ccbee9dd1a1619d64daf01d --- /dev/null +++ b/data/prompt_templates/ClaudeRefined7/seed-based.txt @@ -0,0 +1,78 @@ +You are an AI creating culturally authentic HTML documents based on {num_seed_images} seed images of **{doc_type}** documents. + +# Cultural Variations +Seed images may show regional differences: language/terminology, date/number/currency formats, layout preferences, legal disclaimers, typography standards. + +# Task: Generate {num_solutions} unique HTML documents + +## Requirements +1. **Cultural Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Required Fields**: {required_sections} +3. **Format**: Single-page, dimensions appropriate to document type +4. **Language**: {language} +5. **Background**: {background_requirements} +6. **Uniqueness**: 70%+ different in code, styling, content +7. **Static Only**: No animations, transitions, or dynamic effects + +## Ground Truth +Generate ground truth as JSON in `` tag. 
+Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Technical +- Wrap each in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Specify page size via `@media print { @page { size: ... } }` in CSS and use standard sizes when appropriate + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders (if document type requires) +- Use invisible placeholder divs with class 'visual-placeholder' +- Specify type via data-type attribute (signature, stamp, logo, barcode, photo, chart, etc.) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Position naturally in document flow or use CSS positioning (absolute/relative) as appropriate +- Specify dimensions in mm/cm +- For overlapping elements (e.g. stamps over text), use CSS z-index and absolute positioning +- Example: `
` +- Example: `
` + +## Structural Elements (analyze seed images for) +Headers/titles, content organization (tables/lists/paragraphs), data hierarchies, labels/captions, numerical data/dates/references, visual elements (charts/diagrams), footers + +## Additional Requirements +{user_descriptions} + +## Output Format +``` +1. + + ...complete document... + + +2. + + ...complete document... + + +... +``` + +## Quality Checklist +- [ ] Authentic cultural characteristics +- [ ] All required sections: {required_sections} +- [ ] Static styling only +- [ ] Single-page format +- [ ] {language} language +- [ ] Ground truth JSON included +- [ ] 70%+ unique + +Generate {num_solutions} distinct {doc_type} documents. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined8/seed-based.txt b/data/prompt_templates/ClaudeRefined8/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..2c92de2b2134d09db081d31545dc2511579008d8 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined8/seed-based.txt @@ -0,0 +1,60 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Specify page size via `@media print { @page { size: ... } }` and also `body` such that the content looks the same in browser and when printed +- In CSS use standard sizes when appropriate +- Generate only minified CSS, HTML, JS. 
+ +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.) to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders +- Use `
` for non-text elements (signature, stamp, logo, barcode, photo, chart) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Dimensions in mm/cm: `width:30mm;height:20mm;` +- Positioning: `position:absolute;top:50mm;right:20mm;` with `z-index` for overlays +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +- Generate ground truth as JSON in `` tag. +- For each GT entry, insert the key of the entry as the `id` attribute of the corresponding HTML element. +- Individual values MUST BE visible and found in the DOM as elements because we want to get the geometries of the values before printing. +- Example: `
Name:
` +- Example: `
Corp XY LLC
` +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with correct dimensions and minified HTML/CSS +- [ ] Content in {language} +- [ ] GT ids present in HTML and GT JSON present and correctly formatted + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images in {language}. \ No newline at end of file diff --git a/data/prompt_templates/ClaudeRefined9/seed-based.txt b/data/prompt_templates/ClaudeRefined9/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..4b77f2ba4e08dadcf342bce1510e7286ed80df38 --- /dev/null +++ b/data/prompt_templates/ClaudeRefined9/seed-based.txt @@ -0,0 +1,54 @@ +You are an AI creating authentic HTML representations of documents based on seed images. +Analyze the seed images for structural and semantic content and generate authentic variations. +The generated documents will be printed. + +## Requirements +1. **Authenticity**: Reflect stylistic elements from seed images without copying text/layouts verbatim +2. **Format**: Single-page documents with dimensions appropriate to the document type +3. **Language**: {language} +4. **Static Only**: No animations, transitions, or dynamic effects + +## Technical +- Wrap each document in `...` tags, numbered sequentially +- Static CSS only for single-page layout +- Generate only minified CSS, HTML, JS. + +## Content Guidelines +**DO**: Adapt cultural elements, vary layouts/colors/typography, use static styling +**DON'T**: Copy text/code blocks, reuse identical sections, include dynamic effects + +## Handwritten Fields (if document type requires) +- Mark with class 'handwritten' +- Apply generously increased size to 'handwritten', in line with realistic handwriting +- Assign author ID via class ('author1', 'author2', etc.)
to distinguish different people +- Never include signatures as handwriting + +## Visual Placeholders +- Use `
` for non-text elements (signature, stamp, logo, barcode, photo, chart) +- Add data-content attribute with actual content description +- For signatures, add author class ('author1', 'author2', etc.) to distinguish different people +- Dimensions in mm/cm: `width:30mm;height:20mm;` +- Positioning: `position:absolute;top:50mm;right:20mm;` with `z-index` for overlays +- Example: `
` +- Example: `
` + +## Output Format +Generate minified HTML like this: +``` +1. +2. +... +``` +## Ground Truth +Generate ground truth as JSON in `` tag. +Ground truth specification: {gt_type} +Ground truth must follow the format: {gt_format} + +## Quality Checklist +- [ ] Authentic variations without verbatim copying from seed images +- [ ] Static styling only (no animations or dynamic effects) +- [ ] Single-page format with minified HTML/CSS/JS +- [ ] Content in {language} +- [ ] GT JSON present and correctly formatted + +Generate {num_solutions} distinct {doc_type} documents based on {num_seed_images} seed images. \ No newline at end of file diff --git a/data/prompt_templates/DocGenie/seed-based.txt b/data/prompt_templates/DocGenie/seed-based.txt new file mode 100755 index 0000000000000000000000000000000000000000..8c0911a092f031d8914d4831e9a7dfd33944a1d0 --- /dev/null +++ b/data/prompt_templates/DocGenie/seed-based.txt @@ -0,0 +1,39 @@ +You are an AI specialized in generating unique HTML +documents based on multiple scanned images of realworld examples. You have been provided with distinct +sample images, each from a different cultural or regional +background. You have been provided seed images of +{doc type}, each originating from different cultural or regional contexts. For example, some might feature: +• Local languages or regional disclaimers +• Different date formats (e.g., dd/mm/yyyy vs. mm/dd/yyyy) +• Unique currency or numbering formats +• Varying layout norms (positions of key fields, disclaimers, official stamps, etc.) +Now, please generate {num solutions} unique HTML +documents that: +1. Strictly reflect the overall style, layout, and cultural +cues found in these samples, but do NOT copy any text, +disclaimers, or layout verbatim from the samples. +2. Include any essential mandatory fields: {sections}. +3. Maintain an A4 size format for printing (using @page +{{ size: A4; }} or similar CSS). +4. Maintain a {background requirements}. +5. 
Avoid copy-pasting or reusing large chunks of HTML, +CSS, or disclaimers—each document must be at least +70% different in code and text than the others. +6. Strictly wrap each new document in +... tags, for example: +1. ...Solution #1... +2. ...Solution #2... +... +{num solutions}. ...Solution +#{num solutions}... +Additional Requirements: {user descriptions} +Notes: +• Pay close attention to cultural/regional differences seen +in the seed images (e.g., language, format, disclaimers). +• Feel free to creatively adapt or combine stylistic cues +from the seeds, as long as the end result looks authentic +for that cultural context. +• Do NOT directly copy-paste text or entire code blocks +from any single seed image or across these new solutions. +Now please generate the {num solutions} distinct +{doc type} documents. diff --git a/data/prompt_templates/DocGenie/seed-free.txt b/data/prompt_templates/DocGenie/seed-free.txt new file mode 100755 index 0000000000000000000000000000000000000000..fbdc60bdb2ca1ee2b1904774c481d90d0fe02966 --- /dev/null +++ b/data/prompt_templates/DocGenie/seed-free.txt @@ -0,0 +1,24 @@ +You are an AI specialized in generating multiple unique +HTML documents in one response. Please create +{num solutions} unique HTML documents representing +{doc type}. +Each solution must: +1. Include all mandatory fields: {sections}. +2. Be formatted so it could print on A4 (e.g., use @page +{{ size: A4; }} in your CSS). +3. Show a significantly different layout, styling, and textual content from every other solution. +4. Maintain a {background requirements}. +5. Avoid copy-pasting or reusing large chunks of HTML, +CSS, or disclaimers—each document must be at least +70% different in code and text than the others. +6. Wrap each complete document between +and tags, labeled as: +1. ...Solution #1... +2. ...Solution #2... +... +{num solutions}. ...Solution +#{num solutions}... 
+Do not provide additional commentary or references to the +other solutions within each HTML. +Now generate the {num solutions} distinct {doc type} +documents. diff --git a/data/syn_dataset_definitions/cord_alpha=0.5.yaml b/data/syn_dataset_definitions/cord_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..56477fc1cb5e7283ad9e4c755a9d66e190947ade --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.5.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=0.5" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: 
TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are 
allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. 
"MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..97c8b163be1e4465a3d47ade8cd74a2de3a812cb --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.5_v1.yaml @@ -0,0 +1,236 @@ +name: "cord_alpha=0.5_v1" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: 
SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" 
+prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. 
+ * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=0.75.yaml b/data/syn_dataset_definitions/cord_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..17a90c6b18a4d805e8a7e45dfdb226434d95ca46 --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.75.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=0.75" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + 
MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this 
but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. 
+ * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/cord_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..8c8349967f803443cb53f336fbb11aed6fdb7e01 --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=0.75_v1.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=0.75_v1" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: 
MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 
+ - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. 
+ * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=1.0.yaml b/data/syn_dataset_definitions/cord_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..564c6ef6081edeebaf80a4a78c0ed17f8fa98f14 --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=1.0.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=1.0" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - 
TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB.NM #MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB.UNITPRICE #MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB.CNT # MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB.PRICE #MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 
+ - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. 
+ * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
+ +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/cord_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/cord_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..f4c0bc3fef2d8b5c777b2b280aeb2891a5a0899f --- /dev/null +++ b/data/syn_dataset_definitions/cord_alpha=1.0_v1.yaml @@ -0,0 +1,235 @@ +name: "cord_alpha=1.0_v1" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "cord" +documents_count: 1200 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT + +label_mapping: + MENU_NM: MENU.NM + MENU_NUM: MENU.NUM + MENU_UNITPRICE: MENU.UNITPRICE + MENU_CNT: MENU.CNT + MENU_DISCOUNTPRICE: MENU.DISCOUNTPRICE + MENU_PRICE: MENU.PRICE + MENU_ITEMSUBTOTAL: MENU.ITEMSUBTOTAL + MENU_VATYN: MENU.VATYN + MENU_ETC: MENU.ETC + MENU_SUB_NM: MENU.SUB_NM + MENU_SUB_UNITPRICE: MENU.SUB_UNITPRICE + MENU_SUB_CNT: MENU.SUB_CNT + MENU_SUB_PRICE: MENU.SUB_PRICE + MENU_SUB_ETC: MENU.SUB_ETC + VOID_MENU_NM: VOID_MENU.NM + VOID_MENU_PRICE: VOID_MENU.PRICE + SUB_TOTAL_SUBTOTAL_PRICE: SUB_TOTAL.SUBTOTAL_PRICE + SUB_TOTAL_DISCOUNT_PRICE: SUB_TOTAL.DISCOUNT_PRICE + SUB_TOTAL_SERVICE_PRICE: SUB_TOTAL.SERVICE_PRICE + SUB_TOTAL_OTHERSVC_PRICE: SUB_TOTAL.OTHERSVC_PRICE + SUB_TOTAL_TAX_PRICE: SUB_TOTAL.TAX_PRICE + SUB_TOTAL_ETC: SUB_TOTAL.ETC + 
TOTAL_TOTAL_PRICE: TOTAL.TOTAL_PRICE + TOTAL_TOTAL_ETC: TOTAL.TOTAL_ETC + TOTAL_CASHPRICE: TOTAL.CASHPRICE + TOTAL_CHANGEPRICE: TOTAL.CHANGEPRICE + TOTAL_CREDITCARDPRICE: TOTAL.CREDITCARDPRICE + TOTAL_EMONEYPRICE: TOTAL.EMONEYPRICE + TOTAL_MENUTYPE_CNT: TOTAL.MENUTYPE_CNT + TOTAL_MENUQTY_CNT: TOTAL.MENUQTY_CNT + +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items 
(multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. 
"MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_CLS.yaml b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_CLS.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a584e090f06d98708333198de6d6102016ff0ff3 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_CLS.yaml @@ -0,0 +1,40 @@ +name: "doclaynet4k_alpha=1.0_CLS" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "doclaynet_4k_cls" +documents_count: 4500 +valid_labels: + - financial_reports + - scientific_articles + - laws_and_regulations + - government_tenders + - manuals + - patents +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + document class label + * financial_reports + * scientific_articles + * laws_and_regulations + * government_tenders + * manuals + * patents + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_DLA.yaml b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_DLA.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d672f79881d2d4d10c146a48319a43ce4dc533b2 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet4k_alpha=1.0_DLA.yaml @@ -0,0 +1,60 @@ +name: "doclaynet4k_alpha=1.0_DLA" +task: "DLA" 
+dataloader_model_task_as: +base_dataset_name: "doclaynet_4k_dla" +documents_count: 4500 +valid_labels: + - LE-CAPTION + - LE-FOOTNOTE + - LE-FORMULA + - LE-LIST-ITEM + - LE-PAGE-FOOTER + - LE-PAGE-HEADER + - LE-PICTURE + - LE-SECTION-HEADER + - LE-TABLE + - LE-TEXT + - LE-TITLE +label_mapping: + LE-CAPTION: Caption + LE-FOOTNOTE: Footnote + LE-FORMULA: Formula + LE-LIST-ITEM: List-item + LE-PAGE-FOOTER: Page-footer + LE-PAGE-HEADER: Page-header + LE-PICTURE: Picture + LE-SECTION-HEADER: Section-header + LE-TABLE: Table + LE-TEXT: Text + LE-TITLE: "Title " +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + * "LE-CAPTION": Text that accompanies and explains figures, tables, or other visual elements, typically appearing above or below the referenced element. + * "LE-FOOTNOTE": Supplementary notes or citations placed at the bottom of a page, providing additional context or references to the main text, distinct from footers. + * "LE-FORMULA": Mathematical equations, chemical formulas, or symbolic expressions, whether displayed inline or as standalone elements. + * "LE-LIST-ITEM": Individual items within enumerated, bulleted, or definition lists, with each list item annotated separately rather than as a unified list structure. + * "LE-PAGE-FOOTER": Recurring content at the bottom of pages such as page numbers, copyright notices, document identifiers, or footer text. + * "LE-PAGE-HEADER": Recurring content at the top of pages including running headers, document titles, chapter names. + * "LE-PICTURE": Photographs, diagrams, charts, graphs, illustrations, and other visual content excluding tables. + * "LE-SECTION-HEADER": Section and subsection headings. + * "LE-TABLE": Complete table structure including grid content, inline captions, and column/row headers as a unified element. 
+ * "LE-TEXT": Contains regular body text including paragraphs, abstracts, definitions, descriptions, and other primary textual content. + * "LE-TITLE": The main document title appearing prominently at the beginning of the document, distinct from section headers. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet_alpha=1.0_CLS.yaml b/data/syn_dataset_definitions/doclaynet_alpha=1.0_CLS.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e741acc279eca821ab3759ef3bc290468e91d052 --- /dev/null +++ b/data/syn_dataset_definitions/doclaynet_alpha=1.0_CLS.yaml @@ -0,0 +1,40 @@ +name: "doclaynet_alpha=1.0_CLS" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "doclaynet" +documents_count: 4500 +valid_labels: + - financial_reports + - scientific_articles + - laws_and_regulations + - government_tenders + - manuals + - patents +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + document class label + * financial_reports + * scientific_articles + * laws_and_regulations + * government_tenders + * manuals + * patents + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/doclaynet_alpha=1.0_DLA.yaml b/data/syn_dataset_definitions/doclaynet_alpha=1.0_DLA.yaml new file mode 100755 index 0000000000000000000000000000000000000000..704cc415d61b48f0ac721ea49b2249a46036ac95 --- /dev/null +++ 
b/data/syn_dataset_definitions/doclaynet_alpha=1.0_DLA.yaml @@ -0,0 +1,49 @@ +name: "doclaynet_alpha=1.0_DLA" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "doclaynet" +documents_count: 4500 +valid_labels: + - LE-CAPTION + - LE-FOOTNOTE + - LE-FORMULA + - LE-LIST-ITEM + - LE-PAGE-FOOTER + - LE-PAGE-HEADER + - LE-PICTURE + - LE-SECTION-HEADER + - LE-TABLE + - LE-TEXT + - LE-TITLE +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + * "LE-CAPTION": Text that accompanies and explains figures, tables, or other visual elements, typically appearing above or below the referenced element. + * "LE-FOOTNOTE": Supplementary notes or citations placed at the bottom of a page, providing additional context or references to the main text, distinct from footers. + * "LE-FORMULA": Mathematical equations, chemical formulas, or symbolic expressions, whether displayed inline or as standalone elements. + * "LE-LIST-ITEM": Individual items within enumerated, bulleted, or definition lists, with each list item annotated separately rather than as a unified list structure. + * "LE-PAGE-FOOTER": Recurring content at the bottom of pages such as page numbers, copyright notices, document identifiers, or footer text. + * "LE-PAGE-HEADER": Recurring content at the top of pages including running headers, document titles, chapter names. + * "LE-PICTURE": Photographs, diagrams, charts, graphs, illustrations, and other visual content excluding tables. + * "LE-SECTION-HEADER": Section and subsection headings. + * "LE-TABLE": Complete table structure including grid content, inline captions, and column/row headers as a unified element. + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, definitions, descriptions, and other primary textual content. 
+ * "LE-TITLE": The main document title appearing prominently at the beginning of the document, distinct from section headers. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa.yaml b/data/syn_dataset_definitions/docvqa.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d01d9234a697f4925263966fe1a164452da20ca3 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa.yaml @@ -0,0 +1,24 @@ +name: "docvqa" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.5.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e85ff874ab12aa07a541c08464fba387c9efce65 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.5.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.5" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..b4969951fe24990db65740d49e1e5aeb40bd79af --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.5_v1.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.5_v1" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.75.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..df32b8ec9bedd893272cf1c2dc07abc0a7efc45d --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.75.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.75" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/docvqa_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..0faec31c7ec5b5ddae500e8abfaff2dbbc284de1 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=0.75_v1.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=0.75_v1" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=1.0.yaml b/data/syn_dataset_definitions/docvqa_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a300707ba8462a2b661629dd466a475c170a5011 --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=1.0.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=1.0" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/docvqa_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/docvqa_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..c4600f0d5c0430203a3f34883c18c4d09d18350a --- /dev/null +++ b/data/syn_dataset_definitions/docvqa_alpha=1.0_v1.yaml @@ -0,0 +1,24 @@ +name: "docvqa_alpha=1.0_v1" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_docvqa" +documents_count: 10000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Multiple questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/funsd_alpha=1.0.yaml b/data/syn_dataset_definitions/funsd_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..08368cca7e58a7a63bdf5010f265e0dc704d213b --- /dev/null +++ b/data/syn_dataset_definitions/funsd_alpha=1.0.yaml @@ -0,0 +1,133 @@ +name: "funsd_alpha=1.0" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "funsd" +documents_count: 300 +valid_labels: + - HEADER + - QUESTION + - ANSWER +label_mapping: +valid_secondary_labels: + - PAIR_1 + - PAIR_2 + - PAIR_3 + - PAIR_4 + - PAIR_5 + - PAIR_6 + - PAIR_7 + - PAIR_8 + - PAIR_9 + - PAIR_10 + - PAIR_11 + - PAIR_12 + - PAIR_13 + - PAIR_14 + - PAIR_15 + - PAIR_16 + - PAIR_17 + - PAIR_18 + - PAIR_19 + - PAIR_20 + - PAIR_21 + - PAIR_22 + - PAIR_23 + - PAIR_24 + - PAIR_25 + - PAIR_26 + - PAIR_27 + - PAIR_28 + - PAIR_29 + - PAIR_30 + - PAIR_31 + - PAIR_32 + - PAIR_33 + - PAIR_34 + - PAIR_35 + - PAIR_36 + - PAIR_37 + - PAIR_38 + - PAIR_39 + - PAIR_40 + - PAIR_41 + - PAIR_42 + - PAIR_43 + - PAIR_44 + - PAIR_45 + - PAIR_46 + - PAIR_47 + - PAIR_48 + - PAIR_49 + - PAIR_50 + - PAIR_51 + - PAIR_52 + - PAIR_53 + - PAIR_54 + - PAIR_55 + - PAIR_56 + - PAIR_57 + - PAIR_58 + - PAIR_59 + - PAIR_60 + - PAIR_61 + - PAIR_62 + - PAIR_63 + - PAIR_64 + - PAIR_65 + - PAIR_66 + - PAIR_67 + - PAIR_68 + - PAIR_69 + - PAIR_70 + - PAIR_71 + - PAIR_72 + - PAIR_73 + - PAIR_74 + - PAIR_75 + - PAIR_76 + - PAIR_77 + - PAIR_78 + - PAIR_79 + - PAIR_80 + - PAIR_81 + - PAIR_82 + - PAIR_83 + - PAIR_84 + - PAIR_85 + - PAIR_86 + - PAIR_87 + - PAIR_88 + - PAIR_89 + - PAIR_90 + - PAIR_91 + - PAIR_92 + - PAIR_93 + - PAIR_94 + - PAIR_95 + - PAIR_96 + - PAIR_97 + - PAIR_98 + - PAIR_99 + - PAIR_100 + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + 
num_solutions: 3 + doc_type: "form" + language: "English" + gt_type: | + keys and their values structured as QA pairs + * "HEADER": The header of the question answer pair. + * "QUESTION": The question i.e. a key. + * "ANSWER": The answer i.e. a value. + gt_format: | + Group individual annotations in groups using the enumerator class PAIR_ and an annotation class from the list above (e.g. "PAIR_1 QUESTION", "PAIR_1 ANSWER", "PAIR_2 HEADER", ...). + Be sure to annotate using exact spans, i.e. a "QUESTION" element should not contain "ANSWER". + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/icdar2019_alpha=1.0.yaml b/data/syn_dataset_definitions/icdar2019_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..268048251ed1767b29dbdaa91e4f91356d05e9bc --- /dev/null +++ b/data/syn_dataset_definitions/icdar2019_alpha=1.0.yaml @@ -0,0 +1,27 @@ +name: "icdar2019_alpha=1.0" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "icdar2019" +documents_count: 1600 +valid_labels: + - LE-TABLE +label_mapping: + LE-TABLE: table +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse modern digital-born and historical archival scanned" + language: "English" + gt_type: | + * "LE-TABLE": Any tabular structure containing data organized in rows and columns. Include the complete table region from border to border. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/kleister_alpha=1.0.yaml b/data/syn_dataset_definitions/kleister_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..3c43337a33cae3f3d4477805b7b685140063027b --- /dev/null +++ b/data/syn_dataset_definitions/kleister_alpha=1.0.yaml @@ -0,0 +1,41 @@ +name: "kleister_alpha=1.0" +task: "KIE" +dataloader_model_task_as: "QA" +base_dataset_name: "ex_klc" +documents_count: 4000 +valid_labels: + - address__post_town + - address__postcode + - address__street_line + - charity_name + - charity_number + - income_annually_in_british_pounds + - report_date + - spending_annually_in_british_pounds +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "UK charity annual financial report" + language: "English" + gt_type: | + keys and their values (if applicable, provide as plaintext values from the document): + * "address__post_town": Post town of the address of the charitable organization. + * "address__postcode": Postcode of the address of the charitable organization. + * "address__street_line": Street line of the address of the charitable organization. + * "charity_name": The name of the charitable organization. + * "charity_number": The registered number of the charitable organization. + * "income_annually_in_british_pounds": The annual income in British Pounds of the charitable organization. + * "report_date": The reporting date of the annual document of the charitable organization. + * "spending_annually_in_british_pounds": The annual spending in British Pounds of the charitable organization. 
+ gt_format: '{"address__post_town": "", "spending_annually_in_british_pounds": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/cord.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/cord.yaml new file mode 100755 index 0000000000000000000000000000000000000000..163946dbc2158f3fff6154388ffe18514685689e --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/cord.yaml @@ -0,0 +1,89 @@ +name: "cord" +task: "KIE" +base_dataset_name: "cord" +documents_count: 1000 +valid_labels: + - MENU.NM + - MENU.NUM + - MENU.UNITPRICE + - MENU.CNT + - MENU.DISCOUNTPRICE + - MENU.PRICE + - MENU.ITEMSUBTOTAL + - MENU.VATYN + - MENU.ETC + - MENU.SUB.NM + - MENU.SUB.UNITPRICE + - MENU.SUB.CNT + - MENU.SUB.PRICE + - MENU.SUB.ETC + - VOID_MENU.NM + - VOID_MENU.PRICE + - SUB_TOTAL.SUBTOTAL_PRICE + - SUB_TOTAL.DISCOUNT_PRICE + - SUB_TOTAL.SERVICE_PRICE + - SUB_TOTAL.OTHERSVC_PRICE + - SUB_TOTAL.TAX_PRICE + - SUB_TOTAL.ETC + - TOTAL.TOTAL_PRICE + - TOTAL.TOTAL_ETC + - TOTAL.CASHPRICE + - TOTAL.CHANGEPRICE + - TOTAL.CREDITCARDPRICE + - TOTAL.EMONEYPRICE + - TOTAL.MENUTYPE_CNT + - TOTAL.MENUQTY_CNT + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU.NM": The menu item name. + * "MENU.NUM": The menu item number or identifier. + * "MENU.UNITPRICE": The price per unit of the menu item. + * "MENU.CNT": The quantity or count of the menu item. + * "MENU.DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU.PRICE": The final price of the menu item. + * "MENU.ITEMSUBTOTAL": The subtotal for this menu item line. 
+ * "MENU.VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU.ETC": Other miscellaneous menu item information. + * "MENU.SUB.NM": The name of a sub-item or modifier. + * "MENU.SUB.UNITPRICE": The price per unit of the sub-item. + * "MENU.SUB.CNT": The quantity of the sub-item. + * "MENU.SUB.PRICE": The price of the sub-item. + * "MENU.SUB.ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU.NM": The name of a cancelled or voided item. + * "VOID_MENU.PRICE": The price of the cancelled item. + // Generic receipt data + * "SUB_TOTAL.SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL.DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL.SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL.OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL.TAX_PRICE": The tax amount. + * "SUB_TOTAL.ETC": Other subtotal information. + * "TOTAL.TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL.TOTAL_ETC": Other total-related information. + * "TOTAL.CASHPRICE": The amount paid in cash. + * "TOTAL.CHANGEPRICE": The change given back to the customer. + * "TOTAL.CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL.EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL.MENUTYPE_CNT": The count of different menu item types. + * "TOTAL.MENUQTY_CNT": The total quantity of all items ordered. 
+ gt_format: | + Up to 8 menu items and the receipt data as a JSON object { + "MENU_1": {"MENU.NM": "", "MENU.NUM": "", ...}, + "MENU_2": {"MENU.NM": "", "MENU.NUM": "", ...}, + ..., + "VOID_MENU": {"VOID_MENU.NM": "", "VOID_MENU.PRICE": ""}, + "GENERIC": {"SUB_TOTAL.SUBTOTAL_PRICE": "", ..., "TOTAL.TOTAL_PRICE": ...} + } +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/doclaynet.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/doclaynet.yaml new file mode 100755 index 0000000000000000000000000000000000000000..13ea3e5f1ca88611e3e79b965ee9bc549a50050c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/doclaynet.yaml @@ -0,0 +1,45 @@ +name: "doclaynet" +task: "DLA" +base_dataset_name: "doclaynet" +documents_count: 10 +valid_labels: + - LE-CAPTION + - LE-FOOTNOTE + - LE-FORMULA + - LE-LIST-ITEM + - LE-PAGE-FOOTER + - LE-PAGE-HEADER + - LE-PICTURE + - LE-SECTION-HEADER + - LE-TABLE + - LE-TEXT + - LE-TITLE + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse business and technical" + language: "English" + gt_type: | + Give each applicable element in HTML a layout class from the list below to uniquely identify its label: + * "LE-CAPTION": Text that accompanies and explains figures, tables, or other visual elements, typically appearing above or below the referenced element. + * "LE-FOOTNOTE": Supplementary notes or citations placed at the bottom of a page, providing additional context or references to the main text, distinct from footers. + * "LE-FORMULA": Mathematical equations, chemical formulas, or symbolic expressions, whether displayed inline or as standalone elements. 
+ * "LE-LIST-ITEM": Individual items within enumerated, bulleted, or definition lists, with each list item annotated separately rather than as a unified list structure. + * "LE-PAGE-FOOTER": Recurring content at the bottom of pages such as page numbers, copyright notices, document identifiers, or footer text. + * "LE-PAGE-HEADER": Recurring content at the top of pages including running headers, document titles, chapter names. + * "LE-PICTURE": Photographs, diagrams, charts, graphs, illustrations, and other visual content excluding tables. + * "LE-SECTION-HEADER": Section and subsection headings. + * "LE-TABLE": Complete table structure including grid content, inline captions, and column/row headers as a unified element. + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, definitions, descriptions, and other primary textual content. + * "LE-TITLE": The main document title appearing prominently at the beginning of the document, distinct from section headers. + gt_format: 'Empty JSON object: {}' + +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/docvqa.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/docvqa.yaml new file mode 100755 index 0000000000000000000000000000000000000000..87ba0d4e8a76c87555378271680dbf0d6b1ce488 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/docvqa.yaml @@ -0,0 +1,19 @@ +name: "docvqa" +task: "QA" +base_dataset_name: "ex_docvqa" +documents_count: 1000 # 10.194 Documents in DocVQA train, 39,461 QA pairs +valid_labels: + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/funsd.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/funsd.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7b08ace6c32232d726b3b2800dbe420ee373c7d8 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/funsd.yaml @@ -0,0 +1,28 @@ +name: "funsd" +task: "QA" +base_dataset_name: "funsd" +documents_count: 300 +valid_labels: +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "form" + language: "English" + gt_type: | + keys and their values structured as QA pairs + * "HEADER": The header of the question answer pair. + * "QUESTION": The question i.e. a key. + * "ANSWER": The answer i.e. a value. + gt_format: | + Up to 8 pairs as a JSON object { + "PAIR_1": {"header": "
", "question": "", "answer": ""}, + "PAIR_2": {"header": "
", "question": "", "answer": ""}, + ... + } + + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/icdar2019.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/icdar2019.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7e8b12404d1fb0551366803399633fe5784e726c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/icdar2019.yaml @@ -0,0 +1,25 @@ +name: "icdar2019" +task: "DLA" +base_dataset_name: "icdar2019" +documents_count: 10 +valid_labels: + - LE-TABLE + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of diverse modern digital-born and historical archival scanned" + language: "English" + gt_type: | + Give each applicable element in HTML a layout class from the list below to uniquely identify its label: + * "LE-TABLE": Any tabular structure containing data organized in rows and columns. Include the complete table region from border to border. 
+ gt_format: 'Empty JSON object: {}' + +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 + +# Issues: +# TODO: \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/publaynet.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/publaynet.yaml new file mode 100755 index 0000000000000000000000000000000000000000..6fdccad3871b4bdc9cc0c72c5be35882cb3109cc --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/publaynet.yaml @@ -0,0 +1,30 @@ +name: "publaynet" +task: "DLA" +base_dataset_name: "publaynet" +documents_count: 10 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + Give each applicable element in HTML a layout class from the list below to uniquely identify its label: + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises article titles and standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: 'Empty JSON object: {}' + +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/rvlcdip.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/rvlcdip.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e5756d2d6451afc201a9cb5a74bf4c7b3e03cd87 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/rvlcdip.yaml @@ -0,0 +1,52 @@ +name: "rvlcdip" +task: "CLASSIFICATION" +base_dataset_name: "rvlcdip" +documents_count: 10 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/sroie.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/sroie.yaml new file mode 100755 index 0000000000000000000000000000000000000000..18680f4cd72e6fd69399e291f9b77adc3443ce5c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/sroie.yaml @@ -0,0 +1,32 @@ +name: "sroie" +task: "KIE" +base_dataset_name: "sroie" +documents_count: 1000 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL + +prompt_template: "ClaudeRefined11" +prompt_params: + 
num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: 'JSON object {"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/ClaudeRefined11/tobacco3482.yaml b/data/syn_dataset_definitions/legacy/ClaudeRefined11/tobacco3482.yaml new file mode 100755 index 0000000000000000000000000000000000000000..88fc8ea9390ea663bcbf21e5b4ecd5a66a64f89b --- /dev/null +++ b/data/syn_dataset_definitions/legacy/ClaudeRefined11/tobacco3482.yaml @@ -0,0 +1,44 @@ +name: "tobacco3482" +task: "CLASSIFICATION" +base_dataset_name: "tobacco3482" +documents_count: 1000 +valid_labels: + - ADVERTISEMENT + - EMAIL + - FORM + - LETTER + - MEMO + - NEWS_ARTICLE + - NOTE + - REPORT + - RESUME + - SCIENTIFIC + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "legal and corporate" + language: "English" + gt_type: | + document class labels: + * ADVERTISEMENT: Advertisement + * EMAIL: Email + * FORM: Form + * LETTER: Letter + * MEMO: Memo + * NEWS_ARTICLE: News article + * NOTE: Note/handwritten note + * REPORT: Report + * RESUME: Resume/CV + * SCIENTIFIC: Scientific publication + gt_format: 'JSON object {"label": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: start | 38.61 USD +# ICVPR: end | 50.37 USD +# 936 samples @ 11.76 USD => 1.25 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-handwritten-sizes4.yaml 
b/data/syn_dataset_definitions/legacy/docvqa-handwritten-sizes4.yaml new file mode 100755 index 0000000000000000000000000000000000000000..896cd126f1e76400132082ab4923e7480408057c --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-handwritten-sizes4.yaml @@ -0,0 +1,20 @@ +name: "docvqa-handwritten-sizes4" +documents_count: 10 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa-handwritten-examples" +seed_images_count: 1 +seed_image_max_width: 512 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-pipelinetest.yaml b/data/syn_dataset_definitions/legacy/docvqa-pipelinetest.yaml new file mode 100755 index 0000000000000000000000000000000000000000..b30d37b86b73d7c270de3c1a0ec3d68b00a943f4 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-pipelinetest.yaml @@ -0,0 +1,21 @@ +name: "docvqa-pipelinetest" +base_dataset_name: "ex_docvqa" +documents_count: 100 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +sampling_strategy: "proportional_cluster_size_sampling" \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=-1.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=-1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..8cf71906cc43e233c6c83624a69433bc2f0416d6 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=-1.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=-1" +base_dataset_name: "ex_docvqa" +documents_count: 1 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: -1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.5.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7f5d1ad9d170f9d172ab5ad7213be22453c67f88 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.5.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=0.5" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.75.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..4dd7d39c594f76a82c01b9fd5f19cd05c419815e --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.75.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=0.75" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..830a206982556a4dbf750e0ebd95e84f423e5a1d --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=0.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=0" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test-alpha=1.yaml b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..435ac48602e6c965d48be81c6c18eddb2028ae16 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test-alpha=1.yaml @@ -0,0 +1,22 @@ +name: "docvqa-test-alpha=1" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-test.yaml b/data/syn_dataset_definitions/legacy/docvqa-test.yaml new file mode 100755 index 0000000000000000000000000000000000000000..c241265205f64c3e77f1773e02273d7ff3c49b85 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-test.yaml @@ -0,0 +1,21 @@ +name: "docvqa-test" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +prompt_template: "ClaudeRefined7" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-viselems.yaml b/data/syn_dataset_definitions/legacy/docvqa-viselems.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7c29e7c4bb4bfcbe802f90bad2e5a0f0d1fc9dbe --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-viselems.yaml @@ -0,0 +1,21 @@ +name: "docvqa-viselems" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +prompt_template: "ClaudeRefined10" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." + gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/docvqa-viselems2.yaml b/data/syn_dataset_definitions/legacy/docvqa-viselems2.yaml new file mode 100755 index 0000000000000000000000000000000000000000..9f8248735bf2f1929786613922dfdf31657d79ba --- /dev/null +++ b/data/syn_dataset_definitions/legacy/docvqa-viselems2.yaml @@ -0,0 +1,18 @@ +name: "docvqa-viselems2" +task: "QA" +base_dataset_name: "ex_docvqa" +documents_count: 50 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_count: 10 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/sroie-test.yaml b/data/syn_dataset_definitions/legacy/sroie-test.yaml new file mode 100755 index 0000000000000000000000000000000000000000..9538dbc43640bde0d5c5158dd747e7fab6a7bce2 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/sroie-test.yaml @@ -0,0 +1,27 @@ +name: "sroie-test" +task: "KIE" +base_dataset_name: "sroie" +documents_count: 100 + +prompt_template: "ClaudeRefined11" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: 'JSON object {"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/sroie_as_annotation.yaml b/data/syn_dataset_definitions/legacy/sroie_as_annotation.yaml new file mode 100755 index 0000000000000000000000000000000000000000..41cbee70ab5b361bbc639a9bfcf5a7c078c1c817 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/sroie_as_annotation.yaml @@ -0,0 +1,34 @@ +name: "sroie" +task: "KIE" +base_dataset_name: "sroie" +documents_count: 50 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. 
+ * "TOTAL": The total amount. + gt_format: | + Ensure every label is only present once and to annotate exact using spans, e.g. "ADDRESS" element should not contain other contact info. + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-authors-visual_elements-examples_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-authors-visual_elements-examples_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..a0761ea7cda99c190ae82ab93954b00eac58c881 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-authors-visual_elements-examples_seed_based.yaml @@ -0,0 +1,20 @@ +name: "syn_docvqa-handwritten-authors-visual_elements-examples_seed_based" +documents_count: 100 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined2" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa-handwritten-examples" +seed_images_count: 1 +seed_image_max_width: 512 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-examples_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-examples_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..93c5fd84c5013b4e2277d28e46c8696efbfa5d1d --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_docvqa-handwritten-examples_seed_based.yaml @@ -0,0 +1,29 @@ +name: "syn-docvqa-handwritten-examples-seed-based" +documents_count: 100 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined1" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "Also include **handwritten textfields**, if the type of document demands it: mark these simply with the HTML class 'handwritten', otherwise apply no specific styles or fonts and treat them as usual text spans. + Analyze the seed images to identify and replicate the primary structural elements, which may include: + * Headers, titles, and document identification + * Main content organization (tables, paragraphs, lists, visual elements) + * Data relationships and hierarchical information + * Labels, captions, and descriptive text + * Numerical data, dates, and reference information + * Visual elements like charts, diagrams, or structured layouts + * Footer information, signatures, or supplementary details + * Any other document-specific organizational patterns observed" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa-handwritten-examples" +seed_images_count: 1 +seed_image_max_width: 500 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_docvqa_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_docvqa_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..cf894d8a287cf033f6a104801e84785090a81587 --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_docvqa_seed_based.yaml @@ -0,0 +1,28 @@ +name: "syn-docvqa-seed-based" +documents_count: 15000 # 10,194 Documents in DocVQA train, 39,461 QA pairs + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined1" +prompt_params: + num_solutions: 3 + doc_type: "business and administrative" + language: "English" + sections: + - "N/A - replicate structural elements observed in seed images" + background_requirements: "white background" + additional_requirements: "Analyze the seed images to identify and replicate the primary structural elements, which may include: + * Headers, titles, and document identification + * Main content organization (tables, paragraphs, lists, visual elements) + * Data relationships and hierarchical information + * Labels, captions, and descriptive text + * Numerical data, dates, and reference information + * Visual elements like charts, diagrams, or structured layouts + * Footer information, signatures, or supplementary details + * Any other document-specific organizational patterns observed" + gt_type: "Up to 4 questions about each document, with their answers taken **verbatim** from the document." 
+ gt_format: '{"": "", "": "", ...}' + +seed_images_folder: "docvqa" +seed_images_count: 10 +seed_image_max_width: 500 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/legacy/syn_sroie_seed_based.yaml b/data/syn_dataset_definitions/legacy/syn_sroie_seed_based.yaml new file mode 100755 index 0000000000000000000000000000000000000000..10cd8bdc1b5e8d45448e7e165089d3e3d5f09cab --- /dev/null +++ b/data/syn_dataset_definitions/legacy/syn_sroie_seed_based.yaml @@ -0,0 +1,23 @@ +name: "syn-sroie-seed-based" +documents_count: 600 + +seed_type: "seed-based" # or "seed-free" +prompt_template: "ClaudeRefined1" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + sections: + - "company" + - "date" + - "address" + - "total" + background_requirements: "white background" + additional_requirements: "None" + gt_type: "keys and their values" + gt_format: '{"company": "company value", "date": "date value", "address": "address value", "total": "total value"}' + +seed_images_folder: "sroie" +seed_images_count: 10 +seed_image_max_width: 500 +seed_image_quality: 80 \ No newline at end of file diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.5.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..4fe80d9ab6b792c2aa65c084788007cf63702c54 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.5.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.5" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, 
authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..c86b6bd8bd993711c1385a8dd4293dc3d80ad83d --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.5_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.5_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. 
+ * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.75.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7f363a06a904e9e7d2ecad5b8677c07b886b000d --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.75.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.75" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. 
+ * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. + gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/publaynet_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..760d1911dd20a864b8d1d5a6d6c961f5440552b9 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=0.75_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=0.75_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=1.0.yaml b/data/syn_dataset_definitions/publaynet_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..0840bd9ac7b3e771cc622821f4e03a035262c5ee --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=1.0.yaml @@ -0,0 +1,39 @@ +name: "publaynet_alpha=1.0" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +label_mapping: + LE-TEXT: text + LE-TITLE: title + LE-TABLE: table + LE-FIGURE: figure + LE-LIST: list +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/publaynet_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..25e71a4f983d90d6529e6984dbfa00bb4e5ce6c6 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_alpha=1.0_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_alpha=1.0_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 # Should have been 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d016b2d8976c16a500fbecd8134112a551ca861a --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.5" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..39002a7736e3235bf95bebdfa4a18d87db2286db --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.5_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.5_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.5 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..eb43943862eae3251bfd507029df532e43731234 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.75" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..7864fc37f1a1fa59d02c47a1f4ab7bbe41c35c0e --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=0.75_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=0.75_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 0.75 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..95f91c0efdcbc04256798d4faf3a14451d8cfe80 --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=1.0" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..14e9cf44dcacec4178f58fad1b2a5cc2ae27caee --- /dev/null +++ b/data/syn_dataset_definitions/publaynet_correct-sampling_alpha=1.0_v1.yaml @@ -0,0 +1,33 @@ +name: "publaynet_correct-sampling_alpha=1.0_v1" +task: "DLA" +dataloader_model_task_as: +base_dataset_name: "publaynet" +documents_count: 4500 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v1" +seed_images_count: 4 +hdbscan_min_cluster_size: 5 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/rvlcdip.yaml b/data/syn_dataset_definitions/rvlcdip.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e0897cbfe4dc63644d48d20d3429076c5909c1a3 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 10 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.5.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.5.yaml new file mode 100755 index 0000000000000000000000000000000000000000..099934af2e96f6783e6c62786b2607c635bce8c6 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.5.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.5" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - 
email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.5_v1.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.5_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..5a9c73a4508b804dbd661dcbc02112494d74f3f6 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.5_v1.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.5_v1" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * 
specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.5 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.75.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.75.yaml new file mode 100755 index 0000000000000000000000000000000000000000..306524e3ec99a18c72bd8a23370a2d2e117e50f3 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=0.75.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.75" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=0.75_v1.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=0.75_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..595c21385ab79a6724b4e54a04eb7b5d5b85e9f9 --- /dev/null +++ 
b/data/syn_dataset_definitions/rvlcdip_alpha=0.75_v1.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=0.75_v1" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 0.75 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=1.0.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..3b32ab6f79eed8a14e2d624c629d99c2bf12cfc3 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=1.0.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=1.0" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + 
num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/rvlcdip_alpha=1.0_v1.yaml b/data/syn_dataset_definitions/rvlcdip_alpha=1.0_v1.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d78111012868203f5be22fe285b3aa56d2cf7d56 --- /dev/null +++ b/data/syn_dataset_definitions/rvlcdip_alpha=1.0_v1.yaml @@ -0,0 +1,57 @@ +name: "rvlcdip_alpha=1.0_v1" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "rvlcdip" +documents_count: 4500 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v1" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/sroie.yaml 
b/data/syn_dataset_definitions/sroie.yaml new file mode 100755 index 0000000000000000000000000000000000000000..0ad1e348176c843296dffc5931517b4aa2d4c9fd --- /dev/null +++ b/data/syn_dataset_definitions/sroie.yaml @@ -0,0 +1,37 @@ +name: "sroie" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "sroie" +documents_count: 50 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. + gt_format: '{"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/sroie_alpha=1.0.yaml b/data/syn_dataset_definitions/sroie_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..1122fc39ac9d8f51714077ce5b866ce536b4985e --- /dev/null +++ b/data/syn_dataset_definitions/sroie_alpha=1.0.yaml @@ -0,0 +1,37 @@ +name: "sroie_alpha=1.0" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "sroie" +documents_count: 1000 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. 
+ gt_format: '{"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/sroie_test.yaml b/data/syn_dataset_definitions/sroie_test.yaml new file mode 100755 index 0000000000000000000000000000000000000000..3b50a0e0bc5fd826fa1f4f45205a9773b78eaddf --- /dev/null +++ b/data/syn_dataset_definitions/sroie_test.yaml @@ -0,0 +1,37 @@ +name: "sroie_test" +task: "KIE" +dataloader_model_task_as: +base_dataset_name: "sroie" +documents_count: 10 +valid_labels: + - COMPANY + - DATE + - ADDRESS + - TOTAL +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + keys and their values + * "COMPANY": The company name. + * "DATE": The date on the receipt. + * "ADDRESS": The address of the company. + * "TOTAL": The total amount. 
+ gt_format: '{"COMPANY": "", "DATE": "", "ADDRESS": "", "TOTAL": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: 22.10.2025 | 11.21 USD +# ICVPR: 23.10.2025 | 38.61 USD +# 1950 samples @ 27.4 USD => 1.4 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/templates/cord.yaml b/data/syn_dataset_definitions/templates/cord.yaml new file mode 100755 index 0000000000000000000000000000000000000000..fb069bd03f06f336deeb77050c121a49cdb96388 --- /dev/null +++ b/data/syn_dataset_definitions/templates/cord.yaml @@ -0,0 +1,200 @@ +name: "cord" +task: "KIE" +base_dataset_name: "cord" +documents_count: 10 +valid_labels: + - MENU_NM + - MENU_NUM + - MENU_UNITPRICE + - MENU_CNT + - MENU_DISCOUNTPRICE + - MENU_PRICE + - MENU_ITEMSUBTOTAL + - MENU_VATYN + - MENU_ETC + - MENU_SUB_NM + - MENU_SUB_UNITPRICE + - MENU_SUB_CNT + - MENU_SUB_PRICE + - MENU_SUB_ETC + - VOID_MENU_NM + - VOID_MENU_PRICE + - SUB_TOTAL_SUBTOTAL_PRICE + - SUB_TOTAL_DISCOUNT_PRICE + - SUB_TOTAL_SERVICE_PRICE + - SUB_TOTAL_OTHERSVC_PRICE + - SUB_TOTAL_TAX_PRICE + - SUB_TOTAL_ETC + - TOTAL_TOTAL_PRICE + - TOTAL_TOTAL_ETC + - TOTAL_CASHPRICE + - TOTAL_CHANGEPRICE + - TOTAL_CREDITCARDPRICE + - TOTAL_EMONEYPRICE + - TOTAL_MENUTYPE_CNT + - TOTAL_MENUQTY_CNT +valid_secondary_labels: + - MENU_1 + - MENU_2 + - MENU_3 + - MENU_4 + - MENU_5 + - MENU_6 + - MENU_7 + - MENU_8 + - MENU_9 + - MENU_10 + - MENU_11 + - MENU_12 + - MENU_13 + - MENU_14 + - MENU_15 + - MENU_16 + - MENU_17 + - MENU_18 + - MENU_19 + - MENU_20 + - MENU_21 + - MENU_22 + - MENU_23 + - MENU_24 + - MENU_25 + - MENU_26 + - MENU_27 + - MENU_28 + - MENU_29 + - MENU_30 + - MENU_31 + - MENU_32 + - MENU_33 + - MENU_34 + - MENU_35 + - MENU_36 + - MENU_37 + - MENU_38 + - MENU_39 + - MENU_40 + - MENU_41 + - MENU_42 + - MENU_43 + - MENU_44 + - MENU_45 + - MENU_46 + - MENU_47 + - MENU_48 + - MENU_49 + - MENU_50 + - MENU_51 + - MENU_52 + - 
MENU_53 + - MENU_54 + - MENU_55 + - MENU_56 + - MENU_57 + - MENU_58 + - MENU_59 + - MENU_60 + - MENU_61 + - MENU_62 + - MENU_63 + - MENU_64 + - MENU_65 + - MENU_66 + - MENU_67 + - MENU_68 + - MENU_69 + - MENU_70 + - MENU_71 + - MENU_72 + - MENU_73 + - MENU_74 + - MENU_75 + - MENU_76 + - MENU_77 + - MENU_78 + - MENU_79 + - MENU_80 + - MENU_81 + - MENU_82 + - MENU_83 + - MENU_84 + - MENU_85 + - MENU_86 + - MENU_87 + - MENU_88 + - MENU_89 + - MENU_90 + - MENU_91 + - MENU_92 + - MENU_93 + - MENU_94 + - MENU_95 + - MENU_96 + - MENU_97 + - MENU_98 + - MENU_99 + - MENU_100 + - VOID_MENU + - VOID_MENU_1 # the LLM shouldn't do this but does + - VOID_MENU_2 + - VOID_MENU_3 + - VOID_MENU_4 + - VOID_MENU_5 + - VOID_MENU_6 + - VOID_MENU_7 + - VOID_MENU_8 + - VOID_MENU_9 + - VOID_MENU_10 + - GENERIC + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 3 + doc_type: "receipt" + language: "English" + gt_type: | + (if applicable, provide as plaintext values from the document) + // Menu items (multiple menu items are allowed) + * "MENU_NM": The menu item name. + * "MENU_NUM": The menu item number or identifier. + * "MENU_UNITPRICE": The price per unit of the menu item. + * "MENU_CNT": The quantity or count of the menu item. + * "MENU_DISCOUNTPRICE": The discount amount applied to the menu item. + * "MENU_PRICE": The final price of the menu item. + * "MENU_ITEMSUBTOTAL": The subtotal for this menu item line. + * "MENU_VATYN": The VAT indicator (yes/no) for the menu item. + * "MENU_ETC": Other miscellaneous menu item information. + * "MENU_SUB_NM": The name of a sub-item or modifier. + * "MENU_SUB_UNITPRICE": The price per unit of the sub-item. + * "MENU_SUB_CNT": The quantity of the sub-item. + * "MENU_SUB_PRICE": The price of the sub-item. + * "MENU_SUB_ETC": Other sub-item information. + // Menu items that were canceled + * "VOID_MENU_NM": The name of a cancelled or voided item. + * "VOID_MENU_PRICE": The price of the cancelled item. 
+ // Generic receipt data + * "SUB_TOTAL_SUBTOTAL_PRICE": The subtotal before additional charges. + * "SUB_TOTAL_DISCOUNT_PRICE": The total discount amount. + * "SUB_TOTAL_SERVICE_PRICE": The service charge or fee. + * "SUB_TOTAL_OTHERSVC_PRICE": Other service charges. + * "SUB_TOTAL_TAX_PRICE": The tax amount. + * "SUB_TOTAL_ETC": Other subtotal information. + * "TOTAL_TOTAL_PRICE": The final total amount on the receipt. + * "TOTAL_TOTAL_ETC": Other total-related information. + * "TOTAL_CASHPRICE": The amount paid in cash. + * "TOTAL_CHANGEPRICE": The change given back to the customer. + * "TOTAL_CREDITCARDPRICE": The amount paid by credit card. + * "TOTAL_EMONEYPRICE": The amount paid by electronic money or digital payment. + * "TOTAL_MENUTYPE_CNT": The count of different menu item types. + * "TOTAL_MENUQTY_CNT": The total quantity of all items ordered. + gt_format: | + Group individual menu items in groups using the menu item enumerator class MENU_ and a sub-field class from the list above (e.g. "MENU_1 MENU_NM", "MENU_1 MENU_CNT", "MENU_2 MENU_NM", ...). + For void/canceled menu items use the class "VOID_MENU" instead of the enumeration. + For generic receipt data use the class "GENERIC". 
+ +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/templates/publaynet.yaml b/data/syn_dataset_definitions/templates/publaynet.yaml new file mode 100755 index 0000000000000000000000000000000000000000..885158bef50feba059b2ca84e94d7d338f833ac4 --- /dev/null +++ b/data/syn_dataset_definitions/templates/publaynet.yaml @@ -0,0 +1,33 @@ +name: "publaynet" +task: "DLA" +base_dataset_name: "publaynet" +documents_count: 20 +valid_labels: + - LE-TEXT + - LE-TITLE + - LE-TABLE + - LE-FIGURE + - LE-LIST +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "annotation" +prompt_params: + num_solutions: 2 + doc_type: "single A4 pages out of one and two column scientific article" + language: "English" + gt_type: | + * "LE-TEXT": Contains regular body text including paragraphs, abstracts, authors, affiliations, keywords, footnotes, footer, references, and captions for figures and tables. + * "LE-TITLE": Comprises all document titles and headings, article titles as well as standalone section or subsection headings that appear on their own line rather than inline with text. + * "LE-TABLE": Denotes the main body content of tables, excluding captions and labels. + * "LE-FIGURE": Indicates the main visual content of figures and illustrations, with multi-panel figures annotated as complete units rather than individual sub-figures. + * "LE-LIST": Represents enumerated or bulleted list structures, with nested lists annotated as single unified objects. 
+ gt_format: + +seed_selection_strategy: "v2" +seed_images_count: 4 +hdbscan_min_cluster_size: 10 +embedding_type: image +alpha: 1 +max_seed_pool: -1 diff --git a/data/syn_dataset_definitions/templates/rvlcdip.yaml b/data/syn_dataset_definitions/templates/rvlcdip.yaml new file mode 100755 index 0000000000000000000000000000000000000000..45e33017660a53598959979e9f09e489a388774a --- /dev/null +++ b/data/syn_dataset_definitions/templates/rvlcdip.yaml @@ -0,0 +1,54 @@ +name: "rvlcdip" +task: "CLASSIFICATION" +base_dataset_name: "rvlcdip" +documents_count: 10 +valid_labels: + - letter + - form + - email + - handwritten + - advertisement + - scientific report + - scientific publication + - specification + - file folder + - news article + - budget + - invoice + - presentation + - questionnaire + - resume + - memo +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "business correspondence and corporate" + language: "English" + gt_type: | + document class label + * letter + * form + * email + * handwritten + * advertisement + * scientific report + * scientific publication + * specification + * file folder + * news article + * budget + * invoice + * presentation + * questionnaire + * resume + * memo + gt_format: 'JSON object {"label": ""}' + +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 \ No newline at end of file diff --git a/data/syn_dataset_definitions/tobacco3482_alpha=1.0.yaml b/data/syn_dataset_definitions/tobacco3482_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..9c89ba154534e79281ee9fb5c7bb90ff7a3958f3 --- /dev/null +++ b/data/syn_dataset_definitions/tobacco3482_alpha=1.0.yaml @@ -0,0 +1,60 @@ +name: "tobacco3482_alpha=1.0" +task: "CLASSIFICATION" +dataloader_model_task_as: +base_dataset_name: "tobacco3482" +documents_count: 5500 +valid_labels: + - ADVERTISEMENT + - EMAIL + - FORM + - LETTER + 
- MEMO + - NEWS_ARTICLE + - NOTE + - REPORT + - RESUME + - SCIENTIFIC +label_mapping: + ADVERTISEMENT: ADVE + EMAIL: Email + FORM: Form + LETTER: Letter + MEMO: Memo + NEWS_ARTICLE: News + NOTE: Note + REPORT: Report + RESUME: Resume + SCIENTIFIC: Scientific + +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "legal and corporate" + language: "English" + gt_type: | + document class labels: + * ADVERTISEMENT: Advertisement + * EMAIL: Email + * FORM: Form + * LETTER: Letter + * MEMO: Memo + * NEWS_ARTICLE: News article + * NOTE: Note/handwritten note + * REPORT: Report + * RESUME: Resume/CV + * SCIENTIFIC: Scientific publication + gt_format: 'JSON object {"label": ""}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 10 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 + +# ICVPR: start | 38.61 USD +# ICVPR: end | 50.37 USD +# 936 samples @ 11.76 USD => 1.25 ct/doc \ No newline at end of file diff --git a/data/syn_dataset_definitions/wtq_alpha=1.0.yaml b/data/syn_dataset_definitions/wtq_alpha=1.0.yaml new file mode 100755 index 0000000000000000000000000000000000000000..d03cc747e95624906ca2e0bdb4c8e7f5abf3dba9 --- /dev/null +++ b/data/syn_dataset_definitions/wtq_alpha=1.0.yaml @@ -0,0 +1,30 @@ +name: "wtq_alpha=1.0" +task: "QA" +dataloader_model_task_as: +base_dataset_name: "ex_wiki" +documents_count: 1600 # 1600 (1400 + 200 margin of error) +valid_labels: +label_mapping: +valid_secondary_labels: + +prompt_template: "ClaudeRefined12" +prompt_task: "json" +prompt_params: + num_solutions: 3 + doc_type: "semi-structured table" + language: "English" + gt_type: | + Multiple complex question-answer pairs in everyday language that can be answered from the associated table, with their answers taken **verbatim** from the document.
+ Common Question Types: + * Lookup: Finding specific cell values ("What is the capital of France?") + * Aggregation: Counting, summing, averaging ("How many players scored over 20 points?") + * Comparison: Finding max/min ("Which country has the largest population?") + * Reasoning: Requiring multiple steps ("What team did the highest scorer play for?") + gt_format: '{"": "", "": "", ...}' + +seed_selection_strategy: "v2" +seed_images_count: 6 +hdbscan_min_cluster_size: 5 +embedding_type: combined +alpha: 1 +max_seed_pool: -1 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM (1).png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..aa97cb337fdcd413404851c8e4d2cb7f97a32e39 --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8862d479ae51472629b63424e6786a6ee0affd0b46c96dff3cc2489d6fdfa85e +size 1210373 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM.png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM.png new file mode 100755 index 0000000000000000000000000000000000000000..785f4fca8497adc0fc39aed20fc6ded7461fb82c --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_10PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf61a520590255d3b96c005b62d52a60b8135fbd3efa6a68a3b8289a865e9217 +size 1435185 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM (1).png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..70d255d6e0cc15d9d7e46880b04e124dae035e47 --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 
2025 - 9_11PM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030bd85435de1b77e07a5ce579686fa807e57195914d84c438de469fb2506948 +size 1537276 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM.png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM.png new file mode 100755 index 0000000000000000000000000000000000000000..9ac0c82944c8810570cadd00c423d117bf5e11ff --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_11PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f871d775be655d255086af8ea04730c235d4040ce4e7503de98f16205ad8a373 +size 1693736 diff --git a/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_12PM.png b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_12PM.png new file mode 100755 index 0000000000000000000000000000000000000000..bc618c88575a5f082debc36aeb90ee8c884b3516 --- /dev/null +++ b/data/visual_element_prefabs/figure/Generated Image October 27, 2025 - 9_12PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1177692955eed1930228849a22197ffa6e93d7c393a82e40dfe584d16b0bcecf +size 1470095 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_13PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_13PM.png new file mode 100755 index 0000000000000000000000000000000000000000..c372c616b1709eee531374257ed7913b108b2de2 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_13PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de46e8129861a4e753e3a6c9b10b00c8d32c3e69a3b5ff78bc2fbf6b0d86b6f +size 1562926 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM (1).png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM (1).png new file mode 100755 index 
0000000000000000000000000000000000000000..4b3680af82196261833234f0c5d28ec8e4662fb1 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f9cb6b08eff876ba35311dfb09a1ef7994219f09997064138570d59acf7ded +size 1195767 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM.png new file mode 100755 index 0000000000000000000000000000000000000000..95547d755a56866ad5afe728506dc399800be2d1 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_14PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6fc569379b9e99d61554940cbea022af613b1cba910d6731d23e83f9929403f +size 1344638 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM (2).png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM (2).png new file mode 100755 index 0000000000000000000000000000000000000000..f4ae0402577db8aaa501c6f0518ef1b32ae50217 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM (2).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71156800daddbf8fd1f7bfc8d5827afc24df0b080750567bcba865569d14fed +size 1439178 diff --git a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM.png new file mode 100755 index 0000000000000000000000000000000000000000..e0534c57bebd1df59c618b1dcba988d9a4a23388 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_16PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8f133f7ec17a2f0eaf58b96c941c0d420b282edc07673ff6f5b03e8dcdd7c7f +size 1261549 diff --git 
a/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_20PM.png b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_20PM.png new file mode 100755 index 0000000000000000000000000000000000000000..8ac33ecef7a424b0e7c93abbe737a68bd144ddf0 --- /dev/null +++ b/data/visual_element_prefabs/logo/Generated Image October 27, 2025 - 9_20PM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205f1444edf957d3301ff43ff3fb8bbfc79210c3b84de1322269cbf77dc7fa71 +size 1697350 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_36AM (2).png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_36AM (2).png new file mode 100755 index 0000000000000000000000000000000000000000..75f45acc15292c3628d3d0f621f01d4b5d28a42f --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_36AM (2).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3467d58ae9d8f2b4bdca834915ac2245096d1b673aaffb1d504246cd9bc67c9 +size 2127546 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM (1).png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM (1).png new file mode 100755 index 0000000000000000000000000000000000000000..2c36cf358e7a7732a638f6f8cc436cd5985c50d8 --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM (1).png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a91e2d887b5f3bc49ea5c98c86e11bbdb806234b6716e2970a5926562e8e6be5 +size 2180085 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM.png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_38AM.png new file mode 100755 index 0000000000000000000000000000000000000000..49003d4dba70cac7bb02ed72dddc89af277e0842 --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 
12_38AM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cb3a5912c28948e49a59d1b61886c9ba113371531a6e25de50efec70b075c74 +size 2096141 diff --git a/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_39AM.png b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_39AM.png new file mode 100755 index 0000000000000000000000000000000000000000..0b5981fde9eca21d21e95bfef7d8cb2c2e54f9b4 --- /dev/null +++ b/data/visual_element_prefabs/photo/Generated Image October 28, 2025 - 12_39AM.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00e880cf6d316a7f46f8505b8e351ca8d21e360d93ca738f339bbd5f27d979b1 +size 2241792 diff --git a/data/visual_element_prefabs/photo/photo1.jpg b/data/visual_element_prefabs/photo/photo1.jpg new file mode 100755 index 0000000000000000000000000000000000000000..c1aa252ccf87453a9a0f3bbbcd333453aa33dd11 --- /dev/null +++ b/data/visual_element_prefabs/photo/photo1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f21fa89f133ca73b40e9b6051b032b7ca0a69ff69a6f0ec160fda62bbbfacd +size 547596 diff --git a/data/visual_element_prefabs/photo/photo2.jpg b/data/visual_element_prefabs/photo/photo2.jpg new file mode 100755 index 0000000000000000000000000000000000000000..2b71d7099b95ab6dd2873d01c9f4e29ee383a5b9 --- /dev/null +++ b/data/visual_element_prefabs/photo/photo2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c55f123f45626687bea3d54bb12353447c45cce7264a13f9250610fa506209cd +size 590629 diff --git a/data/visual_element_prefabs/photo/photo3.jpg b/data/visual_element_prefabs/photo/photo3.jpg new file mode 100755 index 0000000000000000000000000000000000000000..56cfdfdc993cf167c2c49b2eadc5ddfbbd24c2db --- /dev/null +++ b/data/visual_element_prefabs/photo/photo3.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:632ae0175f03eaaf19315a250aa6fb1af62eb4e331e3185967a3348bbd04394a +size 550545 diff 
--git a/data/visual_element_prefabs/photo/photo4.jpg b/data/visual_element_prefabs/photo/photo4.jpg new file mode 100755 index 0000000000000000000000000000000000000000..fbd37c4d57816e3e183e4da832ef2362cba5a83c --- /dev/null +++ b/data/visual_element_prefabs/photo/photo4.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c298f261a7b24aadf96b63fc4cf560a1b69c236f884a58d11cf8e3dc8eed64d +size 552578 diff --git a/data/visual_element_prefabs/photo/photo5.jpg b/data/visual_element_prefabs/photo/photo5.jpg new file mode 100755 index 0000000000000000000000000000000000000000..da5fd64d53b6a2be84f4f9f1df4088b2f7c34433 --- /dev/null +++ b/data/visual_element_prefabs/photo/photo5.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7df040dd747bc9b53e195a71a06171199c57a57b85e10c84c6caaf6613c4defe +size 527274 diff --git a/deploy.sh b/deploy.sh new file mode 100755 index 0000000000000000000000000000000000000000..a79c88db58c289ddb028194da1b432c3c63e8472 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# ============================================ +# DocGenie Deployment Helper Script +# ============================================ +# Quick deployment script for Railway + RunPod + +set -e # Exit on error + +echo "🚀 DocGenie Deployment Helper" +echo "==============================" +echo "" + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Function to print colored messages +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +print_error() { + echo -e "${RED}✗ $1${NC}" +} + +print_info() { + echo -e "${YELLOW}ℹ $1${NC}" +} + +# Check prerequisites +echo "Checking prerequisites..." + +# Check if Docker is installed +if ! command -v docker &> /dev/null; then + print_error "Docker is not installed. Please install Docker first." + exit 1 +fi +print_success "Docker installed" + +# Check if .env exists +if [ ! -f "api/.env" ]; then + print_error "api/.env file not found. 
Please create it first." + exit 1 +fi +print_success "Environment file found" + +# Menu +echo "" +echo "Select deployment option:" +echo "1) Build Handwriting Service Docker image" +echo "2) Push Handwriting Service to Docker Hub" +echo "3) Deploy API to Railway" +echo "4) Run local test environment (docker-compose)" +echo "5) Full deployment (Handwriting + API)" +echo "0) Exit" +echo "" +read -p "Enter option (0-5): " option + +case $option in + 1) + echo "" + print_info "Building Handwriting Service Docker image..." + + # Build image + cd handwriting_service + docker buildx build --platform linux/amd64 \ + -t docgenie-handwriting:latest \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + . + + print_success "Image built successfully" + print_info "Tag: docgenie-handwriting:latest" + ;; + + 2) + echo "" + read -p "Enter your Docker Hub username: " docker_username + + print_info "Tagging image for Docker Hub..." + docker tag docgenie-handwriting:latest ${docker_username}/docgenie-handwriting:latest + + print_info "Pushing to Docker Hub..." + docker push ${docker_username}/docgenie-handwriting:latest + + print_success "Image pushed successfully" + print_info "Deploy this on RunPod: ${docker_username}/docgenie-handwriting:latest" + ;; + + 3) + echo "" + print_info "Deploying API to Railway..." + + # Check if Railway CLI is installed + if ! command -v railway &> /dev/null; then + print_error "Railway CLI not installed. Installing..." + npm i -g @railway/cli + fi + + # Deploy + railway up + + print_success "API deployed to Railway" + print_info "View logs: railway logs" + print_info "View URL: railway open" + ;; + + 4) + echo "" + print_info "Starting local test environment..." 
+ print_info "This will start: Redis, API, Worker, Handwriting Service" + + # Check if GPU is available + if command -v nvidia-smi &> /dev/null; then + print_info "GPU detected, using CUDA" + docker-compose up + else + print_info "No GPU detected, using CPU for handwriting service" + DEVICE=cpu docker-compose up + fi + ;; + + 5) + echo "" + print_info "Full deployment starting..." + + # Step 1: Build handwriting image + print_info "Step 1/4: Building Handwriting Service..." + cd handwriting_service + docker buildx build --platform linux/amd64 \ + -t docgenie-handwriting:latest \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + . + cd .. + print_success "Handwriting image built" + + # Step 2: Push to Docker Hub + echo "" + read -p "Enter your Docker Hub username: " docker_username + print_info "Step 2/4: Pushing to Docker Hub..." + docker tag docgenie-handwriting:latest ${docker_username}/docgenie-handwriting:latest + docker push ${docker_username}/docgenie-handwriting:latest + print_success "Image pushed" + + # Step 3: Deploy to RunPod (manual) + echo "" + print_info "Step 3/4: Deploy to RunPod (manual step)" + print_info "1. Go to https://runpod.io → Serverless → New Endpoint" + print_info "2. Use image: ${docker_username}/docgenie-handwriting:latest" + print_info "3. Select GPU: RTX 4090 or A40" + print_info "4. Set port: 8080" + print_info "5. Set env: DEVICE=cuda" + read -p "Press Enter when RunPod deployment is complete..." + + # Step 4: Get RunPod URL and deploy API + echo "" + read -p "Enter your RunPod endpoint URL: " runpod_url + + print_info "Step 4/4: Deploying API to Railway..." + + # Set HANDWRITING_SERVICE_URL + export HANDWRITING_SERVICE_URL=$runpod_url + + # Deploy to Railway + if ! command -v railway &> /dev/null; then + print_error "Railway CLI not installed. Installing..." + npm i -g @railway/cli + fi + + railway up + + print_success "Full deployment complete!" + echo "" + print_info "Next steps:" + print_info "1. 
Set HANDWRITING_SERVICE_URL in Railway dashboard" + print_info "2. railway variables set HANDWRITING_SERVICE_URL=$runpod_url" + print_info "3. Test: curl https://your-domain.up.railway.app/health" + ;; + + 0) + echo "Goodbye!" + exit 0 + ;; + + *) + print_error "Invalid option" + exit 1 + ;; +esac + +echo "" +print_success "Done!" diff --git a/docgenie/__init__.py b/docgenie/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..56bc87e9e06dec489b2e81b897c8125ca713ec3a --- /dev/null +++ b/docgenie/__init__.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from enum import Enum +from pathlib import Path + +_root_path = Path(__file__).parent.parent.resolve() + + +# Project paths +class ENV: + # General + ROOT_DIR: Path = _root_path + DATA_DIR: Path = ROOT_DIR / "data" + + DATASETS_DIR: Path = ROOT_DIR / "data" / "datasets" + BASE_DATASETS_DIR: Path = DATASETS_DIR / "base_v2" + SYN_DATASETS_PREPARED_DIR: Path = DATASETS_DIR / "synthesized_prepared" + SYN_DATASETS_DIR: Path = DATASETS_DIR / "synthesized_datasets" + + VISUAL_ELEMENT_PREFABS_DIR: Path = DATA_DIR / "visual_element_prefabs" + + EMBEDDINGS_DIR: Path = DATA_DIR / "embeddings" + GT_EMBEDDINGS_DIR: Path = DATA_DIR / "gt_embeddings" + CLUSTERS_DIR: Path = DATA_DIR / "clusters" + CLUSTER_PLOTS: Path = DATA_DIR / "cluster_plots" + SYN_DATASET_STAT_PLOTS: Path = DATA_DIR / "syn_dataset_statistics_plots" + + ANALYZATION_DIR: Path = DATA_DIR / "analyzation" + GT_ANALYZATION_DIR: Path = ANALYZATION_DIR / "gt" + KIE_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "kie" + CLS_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "cls" + QA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "qa" + DLA_GT_ANALYZATION_DIR: Path = GT_ANALYZATION_DIR / "dla" + + WEBAPP_CACHE_DIR: Path = DATA_DIR / "webapp_cache" + QA_GT_WEBAPP_CACHE_DIR: Path = WEBAPP_CACHE_DIR / "qa_gt" + + TEMP_DIR: Path = DATA_DIR / "temp" + + MODELS_DIR: Path = DATA_DIR / "models" + RUNS_DIR: Path = DATA_DIR / "runs" + +
EXPORTS_DIR: Path = DATA_DIR / "exports" + + # Contains combined datasets (original and synthetic) + PREPARED_DATASETS_DIR: Path = DATASETS_DIR / "prepared" + + SYN_DATA_DEFINITIONS_DIR: Path = DATA_DIR / "syn_dataset_definitions" + PROMPT_TEMPLATES_DIR: Path = DATA_DIR / "prompt_templates" + SEED_IMAGES_DIR: Path = DATA_DIR / "seed-images" + + +ENV.BASE_DATASETS_DIR.mkdir(parents=True, exist_ok=True) +ENV.SYN_DATASETS_DIR.mkdir(parents=True, exist_ok=True) +ENV.SYN_DATASETS_PREPARED_DIR.mkdir(parents=True, exist_ok=True) +ENV.VISUAL_ELEMENT_PREFABS_DIR.mkdir(parents=True, exist_ok=True) +ENV.PREPARED_DATASETS_DIR.mkdir(parents=True, exist_ok=True) +ENV.EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True) +ENV.CLUSTERS_DIR.mkdir(parents=True, exist_ok=True) +ENV.TEMP_DIR.mkdir(parents=True, exist_ok=True) +ENV.MODELS_DIR.mkdir(parents=True, exist_ok=True) +ENV.EXPORTS_DIR.mkdir(parents=True, exist_ok=True) +ENV.CLUSTER_PLOTS.mkdir(parents=True, exist_ok=True) +ENV.SYN_DATASET_STAT_PLOTS.mkdir(parents=True, exist_ok=True) +ENV.GT_EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True) +ENV.KIE_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.CLS_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.DLA_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.QA_GT_ANALYZATION_DIR.mkdir(parents=True, exist_ok=True) +ENV.QA_GT_WEBAPP_CACHE_DIR.mkdir(parents=True, exist_ok=True) + + +class LLM: + CLAUDE_SONNET_4 = "claude-sonnet-4-20250514" + CLAUDE_SONNET_4_5 = "claude-sonnet-4-5-20250929" + CLAUDE_HAIKU_4_5 = "claude-haiku-4-5-20251001" + TINYLLM_CLAUDE_SONNET_4 = "anthropic/claude-sonnet-4-20250514" + + +# Default values for generation +class GENERATION: + LLM = LLM.CLAUDE_SONNET_4_5 + MAX_TOKENS = 16384 + HANDWRITING_MODEL_CHECKPOINT = ENV.MODELS_DIR / "handwriting" / "latest.pt" diff --git a/docgenie/generation/constants.py b/docgenie/generation/constants.py new file mode 100755 index 
0000000000000000000000000000000000000000..bd490dc1de606ee4882e25f16827e4f1b9799310 --- /dev/null +++ b/docgenie/generation/constants.py @@ -0,0 +1,61 @@ +SEED_IMAGE_MAX_WIDTH: int = 500 +SEED_IMAGE_QUALITY: int = 80 + +PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_MAX_SIZE = 50 +PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_BATCH = 5 +PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_ITERATION = 5 +PIPELINE_06_GT_VERIFICATION__GT_SIMILARITY_CUTOFF = 0.75 + +PIPELINE_03_RENDER_PDF__MAX_WORKERS = 8 +PIPELINE_03_RENDER_PDF__CHROMIUM_CONCURRENCY = 10 +PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_TIMEOUT = 30 +PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_MAX_RETRIES = 2 + +BS_PARSER = "lxml" # "html.parser" + +PIPELINE_06_EXTRACT_HANDWRITING__MAX_WORD_LEN = -1 +PIPELINE_04_3_SCALE_UP_FACTOR = 3 + +# bboxes read from the PDF (and probably also those retrieved via OCR) don't fit exactly into the geo extracted via JavaScript +BBOX_TO_GEO_MATCHING_THRESHOLD = 25 + +IMAGE_RENDER_EXT = "png" + +HANDWRITING_DEFAULT_BATCH_SIZE = 256 +HANDWRITING_CLASS_NAME = "handwritten" +SIGNATURE_CLASS_NAME = "signature" +HANDWRITING_FONT_SIZE = "26" + +FIXED_HANDWRITING_X_OFFSET = ( + 2 # place all handwritten text 2px to the right to look better +) +MAX_HANDWRITING_RAND_X_OFFSET_LEFT = 1 +MAX_HANDWRITING_RAND_X_OFFSET_RIGHT = 2 +MAX_HANDWRITING_RAND_Y_OFFSET_UP = 1 +MAX_HANDWRITING_RAND_Y_OFFSET_DOWN = 2 +MAX_HANDWRITING_RAND_DEG_ROT = 1 + +PDF_DPI = 200 + +WRITER_STYLES = [ + 404, + 347, + 156, + 253, + 354, + 166, + 320, +] + +# VISUAL_ELEMENT_TYPES = ["stamp", "logo", "barcode", "photo", "chart"] +VISUAL_ELEMENT_TYPES = ["stamp", "logo", "figure", "barcode", "photo"] +VISUAL_ELEMENT_TYPE_SYNONYMS = { + "chart": "figure", + "diagram": "figure", + "plot": "figure", + "graph": "figure", + "illustration": "figure", + "infographic": "figure", + "image": "photo", + "seal": "stamp", +} diff --git a/docgenie/generation/debug.js
b/docgenie/generation/debug.js new file mode 100755 index 0000000000000000000000000000000000000000..d733b16898212e136e6f5bda59f9a0e749bdcca1 --- /dev/null +++ b/docgenie/generation/debug.js @@ -0,0 +1,679 @@ +// GT Display Script +// Reads JSON from element with id="GT" and displays in a readable overlay + +(function() { + // Read JSON from GT element + const gtElement = document.getElementById('GT'); + let gtData = null; + + if (gtElement) { + try { + gtData = JSON.parse(gtElement.textContent); + } catch (e) { + console.error('Failed to parse JSON from GT element:', e); + } + } + + // Generate distinct colors for groups + function generateColor(index) { + const hue = (index * 137.508) % 360; // Golden angle for good distribution + return `hsla(${hue}, 70%, 60%, 0.3)`; + } + + // Generate label color (darker, more saturated) + function generateLabelColor(index) { + const hue = (index * 137.508) % 360; + return `hsla(${hue}, 80%, 40%, 1)`; + } + + // Highlight and collect MENU_, PAIR_, and GENERIC groups + function highlightAndCollectGroups() { + const groups = {}; // Structure: { "MENU_1": { elements: [...], subFields: {...} }, ... 
} + const allElements = document.querySelectorAll('*'); + const groupLabels = []; // Store all label elements for toggling + + // Regex patterns for group identifiers + const menuPattern = /^MENU_(\d+)$/; + const pairPattern = /^PAIR_(\d+)$/; + const genericPattern = /^GENERIC$/; + + // First pass: collect all elements and their group memberships + allElements.forEach(element => { + const classList = Array.from(element.classList); + + // Check if element has GENERIC class + const hasGeneric = classList.includes('GENERIC'); + + // Check for MENU_X or PAIR_X classes + const menuClass = classList.find(cls => cls.match(menuPattern)); + const pairClass = classList.find(cls => cls.match(pairPattern)); + + // Process MENU groups + if (menuClass) { + const groupId = menuClass; + if (!groups[groupId]) { + groups[groupId] = { + elements: [], + subFields: {}, + elementSubFields: new Map() + }; + } + groups[groupId].elements.push(element); + + const elementSubFields = []; + classList.forEach(cls => { + if (cls.startsWith('MENU_') && !cls.match(menuPattern)) { + if (!groups[groupId].subFields[cls]) { + groups[groupId].subFields[cls] = []; + } + groups[groupId].subFields[cls].push(element); + elementSubFields.push(cls); + } + }); + groups[groupId].elementSubFields.set(element, elementSubFields); + } + + // Process PAIR groups + if (pairClass) { + const groupId = pairClass; + if (!groups[groupId]) { + groups[groupId] = { + elements: [], + subFields: {}, + elementSubFields: new Map() + }; + } + groups[groupId].elements.push(element); + + const elementSubFields = []; + classList.forEach(cls => { + if (cls.startsWith('PAIR_') && !cls.match(pairPattern)) { + if (!groups[groupId].subFields[cls]) { + groups[groupId].subFields[cls] = []; + } + groups[groupId].subFields[cls].push(element); + elementSubFields.push(cls); + } + }); + groups[groupId].elementSubFields.set(element, elementSubFields); + } + + // Process GENERIC group + if (hasGeneric) { + const groupId = 'GENERIC'; + if 
(!groups[groupId]) { + groups[groupId] = { + elements: [], + subFields: {}, + elementSubFields: new Map() + }; + } + groups[groupId].elements.push(element); + + const elementSubFields = []; + classList.forEach(cls => { + // For GENERIC, collect all classes that look like subfields: + // - Start with GENERIC_ or GEN_ + // - Are uppercase with underscores (semantic field pattern) + // - Exclude: GENERIC itself, LE- prefixed classes, common utility classes + const isSubfield = cls !== 'GENERIC' && + !cls.startsWith('LE-') && + !cls.startsWith('layout-') && + !cls.startsWith('text-') && + !cls.startsWith('flex') && + !cls.startsWith('grid') && + !cls.startsWith('item-') && + (cls.startsWith('GENERIC_') || + cls.startsWith('GEN_') || + (/^[A-Z_]+$/.test(cls) && cls.includes('_'))); // Uppercase with underscores + + if (isSubfield) { + if (!groups[groupId].subFields[cls]) { + groups[groupId].subFields[cls] = []; + } + groups[groupId].subFields[cls].push(element); + elementSubFields.push(cls); + } + }); + groups[groupId].elementSubFields.set(element, elementSubFields); + } + }); + + // Sort groups by name for consistent coloring + const sortedGroupIds = Object.keys(groups).sort((a, b) => { + // Extract type and number for proper sorting + const aMatch = a.match(/^(MENU|PAIR)_(\d+)$/); + const bMatch = b.match(/^(MENU|PAIR)_(\d+)$/); + + // Handle GENERIC separately + if (a === 'GENERIC' && b !== 'GENERIC') return 1; // GENERIC goes last + if (b === 'GENERIC' && a !== 'GENERIC') return -1; + if (a === 'GENERIC' && b === 'GENERIC') return 0; + + if (aMatch && bMatch) { + if (aMatch[1] !== bMatch[1]) { + return aMatch[1].localeCompare(bMatch[1]); + } + return parseInt(aMatch[2]) - parseInt(bMatch[2]); + } + return a.localeCompare(b); + }); + + // Second pass: apply highlighting with colors + sortedGroupIds.forEach((groupId, index) => { + const color = generateColor(index); + const labelColor = generateLabelColor(index); + + groups[groupId].elements.forEach(element => { + 
element.style.backgroundColor = color; + element.style.transition = 'background-color 0.3s'; + element.style.position = 'relative'; + element.style.outline = `2px solid ${labelColor}`; + element.style.outlineOffset = '-2px'; + + // Get the subfields for this specific element + const elementSubFields = groups[groupId].elementSubFields.get(element) || []; + + // Create label text: GROUP_ID + subfields + let labelText = groupId; + if (elementSubFields.length > 0) { + labelText += ' | ' + elementSubFields.join(', '); + } + + // Add a label above and to the right of the element + const label = document.createElement('div'); + label.textContent = labelText; + label.className = 'group-label'; // Add class for easy toggling + label.style.position = 'absolute'; + label.style.top = '-20px'; + label.style.right = '0'; + label.style.color = labelColor; + label.style.fontWeight = 'bold'; + label.style.fontSize = '9px'; + label.style.backgroundColor = 'rgba(255, 255, 255, 0.95)'; + label.style.padding = '2px 6px'; + label.style.borderRadius = '3px'; + label.style.whiteSpace = 'nowrap'; + label.style.pointerEvents = 'none'; + label.style.zIndex = '1000'; + label.style.boxShadow = '0 2px 4px rgba(0,0,0,0.3)'; + label.style.border = `1px solid ${labelColor}`; + label.style.display = 'block'; // Initially visible + + element.appendChild(label); + groupLabels.push(label); + }); + }); + + return { groups, sortedGroupIds, groupLabels }; + } + + // Display group information in overlay + function displayGroupInfo(groups, sortedGroupIds, container) { + if (sortedGroupIds.length === 0) { + return; + } + + const groupSection = document.createElement('div'); + groupSection.style.marginTop = '15px'; + groupSection.style.paddingTop = '12px'; + groupSection.style.borderTop = '2px solid rgba(255, 255, 255, 0.4)'; + + const groupTitle = document.createElement('div'); + groupTitle.textContent = `Element Groups (${sortedGroupIds.length})`; + groupTitle.style.fontWeight = 'bold'; + 
groupTitle.style.fontSize = '12px'; + groupTitle.style.marginBottom = '10px'; + groupTitle.style.color = 'rgba(255, 200, 100, 1)'; + groupSection.appendChild(groupTitle); + + sortedGroupIds.forEach((groupId, index) => { + const group = groups[groupId]; + const color = generateColor(index); + const labelColor = generateLabelColor(index); + + const groupContainer = document.createElement('div'); + groupContainer.style.marginBottom = '12px'; + groupContainer.style.paddingBottom = '8px'; + groupContainer.style.borderBottom = '1px solid rgba(255, 255, 255, 0.2)'; + + // Group header with color indicator + const groupHeader = document.createElement('div'); + groupHeader.style.display = 'flex'; + groupHeader.style.alignItems = 'center'; + groupHeader.style.marginBottom = '6px'; + + const colorBox = document.createElement('div'); + colorBox.style.width = '16px'; + colorBox.style.height = '16px'; + colorBox.style.backgroundColor = color; + colorBox.style.border = `2px solid ${labelColor}`; + colorBox.style.borderRadius = '3px'; + colorBox.style.marginRight = '8px'; + colorBox.style.flexShrink = '0'; + + const groupLabel = document.createElement('span'); + groupLabel.textContent = `${groupId} (${group.elements.length} element${group.elements.length !== 1 ? 
's' : ''})`; + groupLabel.style.fontWeight = 'bold'; + groupLabel.style.fontSize = '11px'; + groupLabel.style.color = labelColor; + + groupHeader.appendChild(colorBox); + groupHeader.appendChild(groupLabel); + groupContainer.appendChild(groupHeader); + + // Sub-fields + if (Object.keys(group.subFields).length > 0) { + const subFieldsContainer = document.createElement('div'); + subFieldsContainer.style.marginLeft = '24px'; + subFieldsContainer.style.fontSize = '10px'; + + const subFieldsList = document.createElement('div'); + subFieldsList.textContent = 'Sub-fields: ' + Object.keys(group.subFields).sort().join(', '); + subFieldsList.style.color = 'rgba(200, 200, 200, 1)'; + subFieldsList.style.marginBottom = '4px'; + subFieldsContainer.appendChild(subFieldsList); + + // Show content from sub-fields + Object.entries(group.subFields).sort().forEach(([subField, elements]) => { + const subFieldRow = document.createElement('div'); + subFieldRow.style.marginTop = '3px'; + + const subFieldName = document.createElement('span'); + subFieldName.textContent = subField + ': '; + subFieldName.style.color = 'rgba(150, 200, 255, 0.9)'; + subFieldName.style.fontWeight = 'normal'; + + const subFieldValues = document.createElement('span'); + const values = elements + .map(el => el.textContent.trim()) + .filter(text => text.length > 0) + .slice(0, 3); // Limit to first 3 values + + if (values.length > 0) { + subFieldValues.textContent = values.join(', ') + (elements.length > 3 ? '...' 
// Highlight elements whose text exactly matches one of the ground-truth values.
//
// `values` may contain non-string entries (numbers, booleans) and empty strings:
//  - everything is normalised to a trimmed string, because a strict `===`
//    against element text can never match a number;
//  - empty strings are dropped, because they would "match" every element that
//    has no text at all and tint most of the page.
function highlightValues(values) {
    const wanted = new Set(
        (values || [])
            .filter(value => value !== null && value !== undefined)
            .map(value => String(value).trim())
            .filter(value => value.length > 0)
    );
    if (wanted.size === 0) {
        return;
    }

    // All elements except <script>/<style>
    const allElements = document.body.querySelectorAll('*:not(script):not(style)');

    allElements.forEach(element => {
        // Direct text only (text nodes that are immediate children)
        const directText = Array.from(element.childNodes)
            .filter(node => node.nodeType === Node.TEXT_NODE)
            .map(node => node.textContent.trim())
            .join(' ');

        // Match either the direct text or the full (descendant-inclusive) text
        if (wanted.has(directText) || wanted.has(element.textContent.trim())) {
            element.style.backgroundColor = 'rgba(0, 100, 255, 0.3)';
            element.style.transition = 'background-color 0.3s';
        }
    });
}

// Display structured format (header/question/answer).
//
// Each entry of `data` is rendered as a small card: the pair key followed by
// whichever of header / question / answer is truthy. Values are written with
// `textContent`, never `innerHTML`, so markup inside ground-truth strings is
// shown literally instead of being parsed as HTML.
function displayStructuredFormat(data, container) {
    // Append one "Label: value" row to `parent`, applying the given inline styles.
    function appendField(parent, label, value, styles) {
        const row = document.createElement('div');
        row.textContent = `${label}: ${value}`;
        Object.assign(row.style, styles);
        parent.appendChild(row);
    }

    for (const [pairKey, pairData] of Object.entries(data)) {
        // Tolerate null / non-object entries: render the label only.
        const pair = (pairData && typeof pairData === 'object') ? pairData : {};

        const pairContainer = document.createElement('div');
        pairContainer.style.marginBottom = '12px';
        pairContainer.style.paddingBottom = '8px';
        pairContainer.style.borderBottom = '1px solid rgba(255, 255, 255, 0.2)';

        // Pair identifier (e.g., PAIR_1)
        const pairLabel = document.createElement('div');
        pairLabel.textContent = pairKey;
        pairLabel.style.fontSize = '10px';
        pairLabel.style.color = 'rgba(255, 255, 255, 0.6)';
        pairLabel.style.marginBottom = '4px';
        pairContainer.appendChild(pairLabel);

        if (pair.header) {
            appendField(pairContainer, 'Header', pair.header, { marginBottom: '3px' });
        }
        if (pair.question) {
            appendField(pairContainer, 'Question', pair.question, { marginBottom: '3px' });
        }
        if (pair.answer) {
            appendField(pairContainer, 'Answer', pair.answer, { color: 'rgba(100, 255, 150, 1)' });
        }

        container.appendChild(pairContainer);
    }
}

// Display simple key-value format: one flex row per entry of `data`,
// key on the left (blue), value on the right (green).
function displaySimpleFormat(data, container) {
    const table = document.createElement('div');

    for (const [key, value] of Object.entries(data)) {
        const row = document.createElement('div');
        row.style.display = 'flex';
        row.style.marginBottom = '6px';
        row.style.paddingBottom = '6px';
        row.style.borderBottom = '1px dotted rgba(255, 255, 255, 0.2)';

        const keySpan = document.createElement('span');
        keySpan.textContent = key + ':';
        keySpan.style.fontWeight = 'bold';
        keySpan.style.minWidth = '100px';
        keySpan.style.color = 'rgba(150, 200, 255, 1)';

        const valueSpan = document.createElement('span');
        valueSpan.textContent = value;
        valueSpan.style.marginLeft = '10px';
        valueSpan.style.color = 'rgba(100, 255, 150, 1)';

        row.appendChild(keySpan);
        row.appendChild(valueSpan);
        table.appendChild(row);
    }

    container.appendChild(table);
}

// Display nested objects format (e.g., MENU_1, MENU_2, etc.)
+ function displayNestedFormat(data, container) { + for (const [groupKey, groupData] of Object.entries(data)) { + const groupContainer = document.createElement('div'); + groupContainer.style.marginBottom = '14px'; + groupContainer.style.paddingBottom = '10px'; + groupContainer.style.borderBottom = '1px solid rgba(255, 255, 255, 0.3)'; + + // Group identifier (e.g., MENU_1, VOID_MENU, GENERIC) + const groupLabel = document.createElement('div'); + groupLabel.textContent = groupKey; + groupLabel.style.fontSize = '11px'; + groupLabel.style.fontWeight = 'bold'; + groupLabel.style.color = 'rgba(255, 200, 100, 1)'; + groupLabel.style.marginBottom = '6px'; + groupLabel.style.paddingBottom = '4px'; + groupLabel.style.borderBottom = '1px dotted rgba(255, 255, 255, 0.2)'; + groupContainer.appendChild(groupLabel); + + // Check if groupData is an object + if (typeof groupData === 'object' && groupData !== null && !Array.isArray(groupData)) { + // Display nested key-value pairs + const nestedTable = document.createElement('div'); + nestedTable.style.marginLeft = '10px'; + + for (const [key, value] of Object.entries(groupData)) { + const row = document.createElement('div'); + row.style.display = 'flex'; + row.style.marginBottom = '4px'; + row.style.fontSize = '10px'; + + const keySpan = document.createElement('span'); + keySpan.textContent = key + ':'; + keySpan.style.fontWeight = 'normal'; + keySpan.style.minWidth = '150px'; + keySpan.style.color = 'rgba(150, 200, 255, 0.9)'; + + const valueSpan = document.createElement('span'); + valueSpan.textContent = typeof value === 'object' ? 
JSON.stringify(value) : value; + valueSpan.style.marginLeft = '10px'; + valueSpan.style.color = 'rgba(100, 255, 150, 1)'; + + row.appendChild(keySpan); + row.appendChild(valueSpan); + nestedTable.appendChild(row); + } + + groupContainer.appendChild(nestedTable); + } else { + // Handle non-object values + const valueDiv = document.createElement('div'); + valueDiv.textContent = typeof groupData === 'object' ? JSON.stringify(groupData) : groupData; + valueDiv.style.marginLeft = '10px'; + valueDiv.style.color = 'rgba(100, 255, 150, 1)'; + valueDiv.style.fontSize = '10px'; + groupContainer.appendChild(valueDiv); + } + + container.appendChild(groupContainer); + } + } + + // Collect all values recursively from nested objects + function collectValues(obj, values = []) { + if (typeof obj !== 'object' || obj === null) { + values.push(String(obj)); + return values; + } + + for (const value of Object.values(obj)) { + if (typeof value === 'object' && value !== null) { + collectValues(value, values); + } else { + values.push(String(value)); + } + } + + return values; + } + + // Create overlay container + const overlay = document.createElement('div'); + overlay.style.position = 'relative'; + overlay.style.backgroundColor = 'rgba(0, 0, 0, 0.85)'; + overlay.style.color = 'white'; + overlay.style.padding = '15px'; + overlay.style.fontSize = '11px'; + overlay.style.fontFamily = 'monospace'; + overlay.style.marginTop = '8mm'; + overlay.style.borderRadius = '6px'; + overlay.style.boxShadow = '0 2px 10px rgba(0, 0, 0, 0.4)'; + overlay.style.lineHeight = '1.6'; + overlay.style.maxHeight = '500px'; + overlay.style.overflowY = 'auto'; + + let valuesToHighlight = []; + + // Only process GT data if it exists + if (gtData) { + // Create title + const title = document.createElement('div'); + title.textContent = 'Ground Truth Data'; + title.style.fontWeight = 'bold'; + title.style.fontSize = '12px'; + title.style.marginBottom = '10px'; + title.style.borderBottom = '2px solid rgba(255, 255, 255, 
0.4)'; + title.style.paddingBottom = '6px'; + overlay.appendChild(title); + + // Detect format and display accordingly + const firstKey = Object.keys(gtData)[0]; + const firstValue = gtData[firstKey]; + + // Check if it's the header/question/answer format + if (typeof firstValue === 'object' && firstValue !== null && + ('header' in firstValue || 'question' in firstValue || 'answer' in firstValue)) { + // Format 1: Structured pairs with header/question/answer + displayStructuredFormat(gtData, overlay); + // Collect answer values for highlighting + for (const pairData of Object.values(gtData)) { + if (pairData.answer) { + valuesToHighlight.push(pairData.answer); + } + } + } else if (typeof firstValue === 'object' && firstValue !== null) { + // Format 3: Nested objects (MENU_1, MENU_2, etc.) + displayNestedFormat(gtData, overlay); + // Collect all nested values for highlighting + valuesToHighlight = collectValues(gtData); + } else { + // Format 2: Simple key-value pairs + displaySimpleFormat(gtData, overlay); + // Collect all values for highlighting + valuesToHighlight = Object.values(gtData); + } + + // Highlight elements containing the values + highlightValues(valuesToHighlight); + } + + // Highlight placeholder elements with red background + highlightPlaceholders(); + + // Highlight layout elements and display count + const layoutElementCount = highlightLayoutElements(); + + // Highlight and collect MENU_/PAIR_/GENERIC groups + const { groups, sortedGroupIds, groupLabels } = highlightAndCollectGroups(); + + // Add toggle button for group labels + const toggleButton = document.createElement('button'); + toggleButton.textContent = 'Hide Group Labels'; + toggleButton.style.position = 'absolute'; + toggleButton.style.top = '10px'; + toggleButton.style.right = '10px'; + toggleButton.style.padding = '6px 12px'; + toggleButton.style.fontSize = '10px'; + toggleButton.style.fontWeight = 'bold'; + toggleButton.style.backgroundColor = 'rgba(100, 150, 255, 0.9)'; + 
toggleButton.style.color = 'white'; + toggleButton.style.border = 'none'; + toggleButton.style.borderRadius = '4px'; + toggleButton.style.cursor = 'pointer'; + toggleButton.style.boxShadow = '0 2px 4px rgba(0,0,0,0.3)'; + toggleButton.style.pointerEvents = 'auto'; + toggleButton.style.zIndex = '1001'; + + let labelsVisible = true; + toggleButton.addEventListener('click', function() { + labelsVisible = !labelsVisible; + groupLabels.forEach(label => { + label.style.display = labelsVisible ? 'block' : 'none'; + }); + toggleButton.textContent = labelsVisible ? 'Hide Group Labels' : 'Show Group Labels'; + }); + + overlay.appendChild(toggleButton); + + // Add layout element count to overlay + const layoutCountDiv = document.createElement('div'); + layoutCountDiv.textContent = `Layout Elements: ${layoutElementCount}`; + layoutCountDiv.style.fontSize = '11px'; + layoutCountDiv.style.fontWeight = 'bold'; + layoutCountDiv.style.color = 'rgba(255, 255, 100, 1)'; + layoutCountDiv.style.marginTop = '10px'; + layoutCountDiv.style.paddingTop = '8px'; + layoutCountDiv.style.borderTop = '1px solid rgba(255, 255, 255, 0.3)'; + overlay.appendChild(layoutCountDiv); + + // Display group information in overlay + displayGroupInfo(groups, sortedGroupIds, overlay); + + document.body.appendChild(overlay); + + // Highlight elements with data-placeholder attribute + function highlightPlaceholders() { + const placeholderElements = document.querySelectorAll('[data-placeholder]'); + placeholderElements.forEach(element => { + element.style.backgroundColor = 'rgba(255, 0, 0, 0.3)'; + element.style.transition = 'background-color 0.3s'; + element.style.position = 'relative'; + + // Get the placeholder value + const placeholderValue = element.getAttribute('data-placeholder'); + + // Create a label to display the placeholder value + const label = document.createElement('div'); + label.textContent = placeholderValue; + label.style.position = 'absolute'; + label.style.top = '50%'; + label.style.left = 
'50%'; + label.style.transform = 'translate(-50%, -50%)'; + label.style.color = 'red'; + label.style.fontWeight = 'bold'; + label.style.fontSize = '10px'; + label.style.backgroundColor = 'rgba(255, 255, 255, 0.8)'; + label.style.padding = '2px 6px'; + label.style.borderRadius = '3px'; + label.style.whiteSpace = 'nowrap'; + label.style.pointerEvents = 'none'; + label.style.zIndex = '1000'; + + element.appendChild(label); + }); + } + + // Highlight layout-element elements and return count + function highlightLayoutElements() { + // Find all elements with classes starting with LE- + const allElements = document.querySelectorAll('*'); + const layoutElements = Array.from(allElements).filter(element => { + return Array.from(element.classList).some(cls => cls.startsWith('LE-')); + }); + + layoutElements.forEach(element => { + // Highlight with transparent yellow background + element.style.backgroundColor = 'rgba(255, 255, 0, 0.3)'; + element.style.transition = 'background-color 0.3s'; + element.style.position = 'relative'; + + // Extract the type from the classList (e.g., LE-TEXT, LE-TABLE, LE-TITLE) + const classList = Array.from(element.classList); + const typeClass = classList.find(cls => cls.startsWith('LE-')); + + if (typeClass) { + // Create a label to display the type + const label = document.createElement('div'); + label.textContent = typeClass.toUpperCase(); + label.style.position = 'absolute'; + label.style.top = '5px'; + label.style.left = '5px'; + label.style.color = 'rgba(200, 200, 0, 1)'; + label.style.fontWeight = 'bold'; + label.style.fontSize = '10px'; + label.style.backgroundColor = 'rgba(0, 0, 0, 0.5)'; + label.style.padding = '2px 6px'; + label.style.borderRadius = '3px'; + label.style.whiteSpace = 'nowrap'; + label.style.pointerEvents = 'none'; + label.style.zIndex = '1000'; + + element.appendChild(label); + } + }); + + return layoutElements.length; + } +})(); \ No newline at end of file diff --git 
a/docgenie/generation/handwriting_diffusion/add_handwriting_blur.py b/docgenie/generation/handwriting_diffusion/add_handwriting_blur.py new file mode 100755 index 0000000000000000000000000000000000000000..5548a25ded6961de8c7edebf162afa73bdccd039 --- /dev/null +++ b/docgenie/generation/handwriting_diffusion/add_handwriting_blur.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python3 +""" +Post-process generated handwriting token images (e.g. from `generate_handwriting_diffusion_raw.py`) to reduce pixelation +and add a natural soft edge via Gaussian blur + optional scale anti-aliasing. + +Features: + * Recursively scans an input root directory for PNG images (expects per-document subfolders) + * Applies a randomized (or fixed) Gaussian blur radius (on RGB while preserving alpha) + * Optional downscale+upscale anti-alias pass before blur to smooth jagged edges + * Advanced edge refinement options (erosion/dilation/feather of alpha, contrast/gamma, noise, unsharp mask) + * Writes results either (a) in-place with a suffix before extension or (b) into a mirror output directory tree + * Can update an existing mapping JSON (e.g. 
raw_token_map.json) by appending `blurred_image` for each segment + +Typical usage (mirror output tree): + python scripts/add_handwriting_blur.py \ + --input-root syn_docvqa/handwriting_raw_tokens \ + --output-root syn_docvqa/handwriting_raw_tokens_blurred \ + --mapping-json syn_docvqa/handwriting_raw_tokens/raw_token_map.json \ + --append-mapping \ + --radius-min 0.6 --radius-max 1.8 --antialias + +In-place variant (adds suffix _b): + python scripts/add_handwriting_blur.py \ + --input-root syn_docvqa/handwriting_raw_tokens \ + --in-place --suffix _soft --radius 1.2 --append-mapping + +Key Arguments: + --radius: Fixed Gaussian blur radius (overrides min/max if set) + --radius-min/max Range for random uniform blur radius per image when --radius not given + --antialias Enable a downscale+upscale pass before blur (slower but smoother) + --scale-factor Downscale factor when antialiasing (default 0.75) + --suffix Filename suffix (only used in --in-place mode) + --append-mapping Update mapping JSON adding a 'blurred_image' key per segment (keeps original). + --skip-existing Skip processing if blurred file already exists + --extensions Comma separated list of extensions to process (default: .png) + +Mapping Update Behavior: + - Loads JSON, finds segments with an 'image' field + - If a blurred counterpart is produced, adds 'blurred_image' (relative path analogous to original) + - Writes updated mapping next to original unless --mapping-output specified + +Limitations: + - Mapping update assumes relative paths in JSON remain valid under original root. If you mirror into a different + --output-root, the blurred path is generated accordingly. + - Only PNG RGBA expected; non-RGBA images are converted. + +Requires: Pillow, numpy (for advanced options) +(Optional) tqdm for progress bar. 
+""" + +from __future__ import annotations +import argparse +import json +import random +import sys +import shutil +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, List, Dict, Any, Optional +from types import SimpleNamespace + +from PIL import Image, ImageFilter +import numpy as np + +try: + from tqdm import tqdm # type: ignore +except Exception: # pragma: no cover - optional + tqdm = None # type: ignore + + +@dataclass +class BlurConfig: + radius: Optional[float] + radius_min: float + radius_max: float + antialias: bool + scale_factor: float + suffix: str + skip_existing: bool + alpha_erosion: int + alpha_dilation: int + feather: float + contrast: float + ink_gamma: float + add_noise: float + unsharp: Optional[str] + max_alpha: Optional[int] + + +def parse_args() -> argparse.Namespace: + ap = argparse.ArgumentParser( + description="Apply soft blur to handwriting token images." + ) + ap.add_argument( + "--input-root", + type=Path, + required=True, + help="Root directory containing per-document subfolders with images.", + ) + ap.add_argument( + "--output-root", + type=Path, + help="Root to write blurred images (mirrors structure). 
def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line arguments of the blur script.

    Reads ``sys.argv`` through ``argparse``. Invalid combinations terminate the
    process via ``ArgumentParser.error`` (exit status 2):

    * ``--in-place`` together with ``--output-root``, or neither of them;
    * a non-positive ``--radius`` (or ``--radius-min`` when no fixed radius);
    * ``--radius-max`` smaller than ``--radius-min`` when no fixed radius.

    An out-of-range ``--scale-factor`` combined with ``--antialias`` is still
    accepted, but a warning is printed to stderr.

    Returns:
        The populated ``argparse.Namespace``.
    """
    ap = argparse.ArgumentParser(
        description="Apply soft blur to handwriting token images."
    )

    # --- input / output selection -------------------------------------------
    ap.add_argument("--input-root", type=Path, required=True,
                    help="Root directory containing per-document subfolders with images.")
    ap.add_argument("--output-root", type=Path,
                    help="Root to write blurred images (mirrors structure). Omit with --in-place.")
    ap.add_argument("--in-place", action="store_true",
                    help="Blur in-place (creates new files with suffix).")
    ap.add_argument("--suffix", type=str, default="_b",
                    help="Suffix to append before extension in in-place mode.")

    # --- blur strength -------------------------------------------------------
    ap.add_argument("--radius", type=float, default=None,
                    help="Fixed blur radius (overrides min/max).")
    ap.add_argument("--radius-min", type=float, default=0.35,
                    help="Min random blur radius when --radius not set (slight blur).")
    ap.add_argument("--radius-max", type=float, default=0.85,
                    help="Max random blur radius when --radius not set (slight blur).")
    ap.add_argument("--antialias", action="store_true",
                    help="Apply downscale+upscale anti-alias pass before blur.")
    ap.add_argument("--scale-factor", type=float, default=0.75,
                    help="Downscale factor for anti-alias pass.")

    # --- advanced edge / tone controls ---------------------------------------
    ap.add_argument("--alpha-erosion", type=int, default=0,
                    help="Erode alpha mask this many pixels before feather (default off).")
    ap.add_argument("--alpha-dilation", type=int, default=0,
                    help="Dilate alpha mask this many pixels after erosion (default off).")
    ap.add_argument("--feather", type=float, default=0.6,
                    help="Feather (Gaussian blur) radius for alpha edges (subtle).")
    ap.add_argument("--contrast", type=float, default=1.02,
                    help="Contrast multiplier for RGB (slight).")
    ap.add_argument("--ink-gamma", type=float, default=0.98,
                    help="Gamma adjustment for ink intensity (<1 darkens mid-tones slightly).")
    ap.add_argument("--add-noise", type=float, default=0.35,
                    help="Std dev of Gaussian noise (0-10) added to RGB pre-blur (subtle grain).")
    ap.add_argument("--unsharp", type=str, default="0.5,30,2",
                    help="Unsharp mask params radius,percent,threshold (mild crisp restore).")
    ap.add_argument("--max-alpha", type=int, default=None,
                    help="Clamp final alpha to at most this (0-255).")

    # --- mapping JSON handling -----------------------------------------------
    ap.add_argument("--mapping-json", type=Path, help="Path to mapping JSON to update.")
    ap.add_argument("--mapping-output", type=Path,
                    help="Path to write updated mapping (default overwrites original when --append-mapping).")
    ap.add_argument("--append-mapping", action="store_true",
                    help="Append blurred_image field to mapping JSON segments.")

    # --- misc ------------------------------------------------------------------
    ap.add_argument("--skip-existing", action="store_true",
                    help="Skip if output blurred file already exists.")
    ap.add_argument("--extensions", type=str, default=".png",
                    help="Comma-separated list of extensions to process.")
    ap.add_argument("--seed", type=int, default=42,
                    help="Random seed for reproducible radius sampling.")
    ap.add_argument("--no-progress", action="store_true", help="Disable progress bar.")

    args = ap.parse_args()

    if args.in_place and args.output_root:
        ap.error("Cannot specify --output-root with --in-place.")
    if not args.in_place and not args.output_root:
        ap.error("Either provide --output-root or use --in-place.")
    if args.radius is not None and args.radius <= 0:
        ap.error("--radius must be > 0")
    if args.radius is None and args.radius_min <= 0:
        ap.error("--radius-min must be > 0")
    if args.radius is None and args.radius_max < args.radius_min:
        ap.error("--radius-max must be >= --radius-min")

    # The anti-alias pass only does something for a factor strictly between 0
    # and 1. The value is still accepted (not an error), but the user is told
    # that the requested pass will have no effect.
    if args.antialias and not 0 < args.scale_factor < 1:
        print(
            "[WARN] --scale-factor should be in (0, 1) for --antialias to have an "
            f"effect (got {args.scale_factor}); the anti-alias pass will be skipped.",
            file=sys.stderr,
        )
    return args
def _morph_mask(mask_arr: np.ndarray, iters: int, op: str) -> np.ndarray:
    """Apply ``iters`` passes of 3x3 binary morphology to a 0/255 uint8 mask.

    ``op == "erode"`` keeps a pixel only if its whole 3x3 neighbourhood is 255;
    any other ``op`` dilates (pixel becomes 255 if any neighbour is 255).
    Pixels outside the image count as 0, so erosion also eats into the border.
    """
    if iters <= 0:
        return mask_arr
    h, w = mask_arr.shape
    combine = np.logical_and if op == "erode" else np.logical_or
    for _ in range(iters):
        on = np.pad(mask_arr, 1, mode="constant", constant_values=0) == 255
        # Fold the nine shifted views of the padded mask together; this is the
        # same neighbourhood test as a per-pixel loop, done on whole arrays.
        acc = on[0:h, 0:w]
        for dy in range(3):
            for dx in range(3):
                if dy or dx:
                    acc = combine(acc, on[dy : dy + h, dx : dx + w])
        mask_arr = np.where(acc, 255, 0).astype(np.uint8)
    return mask_arr


def process_image(src: Path, dst: Path, cfg: "BlurConfig") -> bool:
    """Blur one image and write the result to ``dst``.

    Pipeline (RGB and alpha are handled separately):
      1. optional contrast / gamma / Gaussian-noise adjustment of RGB;
      2. optional anti-alias pass, then Gaussian blur with the chosen radius;
      3. optional unsharp mask parsed from ``cfg.unsharp`` ("radius,percent,threshold");
      4. optional alpha refinement. NOTE: when any of erosion, dilation, feather
         or ``max_alpha`` is active, the alpha channel is first binarised
         (alpha > 0 -> 255) and then eroded / dilated / feathered / clamped.

    Args:
        src: Source image path.
        dst: Destination path; parent directories are created as needed.
        cfg: Blur configuration.

    Returns:
        True if the image was written. False if it was skipped because ``dst``
        exists and ``cfg.skip_existing`` is set, or if processing failed (the
        error is reported on stderr, never raised).
    """
    if cfg.skip_existing and dst.exists():
        return False
    try:
        # convert() yields an independent in-memory image, so the file handle
        # can be released right away.
        with Image.open(src) as opened:
            im = opened.convert("RGBA")
        r, g, b, a = im.split()
        rgb = Image.merge("RGB", (r, g, b))

        # Tone / noise adjustments
        if cfg.contrast != 1.0 or cfg.ink_gamma != 1.0 or cfg.add_noise > 0:
            arr = np.array(rgb).astype(np.float32) / 255.0
            if cfg.contrast != 1.0:
                arr = (arr - 0.5) * cfg.contrast + 0.5
                arr = np.clip(arr, 0, 1)
            if cfg.ink_gamma != 1.0:
                arr = np.power(arr, cfg.ink_gamma)
            if cfg.add_noise > 0:
                noise = np.random.normal(0, cfg.add_noise / 255.0, arr.shape).astype(
                    np.float32
                )
                arr = np.clip(arr + noise, 0, 1)
            rgb = Image.fromarray((arr * 255).astype(np.uint8), "RGB")

        if cfg.antialias:
            rgb = anti_alias(rgb, cfg.scale_factor)
        radius = choose_radius(cfg)
        rgb = rgb.filter(ImageFilter.GaussianBlur(radius=radius))

        # Optional unsharp mask. Still best-effort (a bad spec never fails the
        # image), but the problem is reported instead of silently ignored.
        if cfg.unsharp:
            parts = [p.strip() for p in cfg.unsharp.split(",")]
            if len(parts) == 3:
                try:
                    rgb = rgb.filter(
                        ImageFilter.UnsharpMask(
                            radius=float(parts[0]),
                            percent=int(parts[1]),
                            threshold=int(parts[2]),
                        )
                    )
                except (ValueError, TypeError) as exc:
                    print(
                        f"[WARN] Ignoring invalid unsharp spec {cfg.unsharp!r}: {exc}",
                        file=sys.stderr,
                    )
            else:
                print(
                    f"[WARN] Ignoring unsharp spec {cfg.unsharp!r}: "
                    "expected radius,percent,threshold",
                    file=sys.stderr,
                )

        # Alpha refinement (erosion / dilation / feather / clamp)
        if (
            cfg.alpha_erosion > 0
            or cfg.alpha_dilation > 0
            or cfg.feather > 0
            or cfg.max_alpha is not None
        ):
            a_np = np.array(a).astype(np.uint8)
            mask = (a_np > 0).astype(np.uint8) * 255
            if cfg.alpha_erosion > 0:
                mask = _morph_mask(mask, cfg.alpha_erosion, "erode")
            if cfg.alpha_dilation > 0:
                mask = _morph_mask(mask, cfg.alpha_dilation, "dilate")
            if cfg.feather > 0:
                mask = np.array(
                    Image.fromarray(mask, "L").filter(
                        ImageFilter.GaussianBlur(radius=cfg.feather)
                    )
                )
            if cfg.max_alpha is not None:
                mask = np.minimum(mask, cfg.max_alpha).astype(np.uint8)
            a = Image.fromarray(mask, "L")

        out = Image.merge("RGBA", (*rgb.split(), a))
        dst.parent.mkdir(parents=True, exist_ok=True)
        out.save(dst)
        return True
    except Exception as e:  # per-image boundary: report and continue the batch
        print(f"[ERROR] Failed to blur {src}: {e}", file=sys.stderr)
        return False
def update_mapping(
    mapping_path: Path, output_path: Path, input_root: Path, args
) -> None:
    """Add a ``blurred_image`` field to mapping segments whose blurred file exists.

    The mapping is expected to be a JSON object with an ``entries`` list. Every
    segment (``entry["segments"]``) that has an ``image`` path relative to
    ``input_root`` is checked for a blurred counterpart:

    * in-place mode: same folder, file name extended by ``args.suffix``;
    * mirror mode: same relative path under ``args.output_root``.

    Args:
        mapping_path: Mapping JSON to read.
        output_path: Where to write the updated mapping when
            ``args.mapping_output`` is not set (falls back to ``mapping_path``).
        input_root: Root that the segments' ``image`` paths are relative to.
        args: Object with ``in_place``, ``suffix``, ``output_root`` and
            (optionally) ``mapping_output`` attributes.

    Nothing is written when no segment was updated. Unreadable or malformed
    mapping files produce a warning on stderr instead of an exception.
    """
    try:
        data = json.loads(mapping_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:  # ValueError covers JSON and decoding errors
        print(f"[WARN] Could not read mapping JSON: {e}", file=sys.stderr)
        return
    if not isinstance(data, dict):
        print(
            "[WARN] Could not read mapping JSON: top-level value is not an object",
            file=sys.stderr,
        )
        return

    changed = False
    for entry in data.get("entries") or []:
        for seg in entry.get("segments", []):
            img_rel = seg.get("image")
            if not img_rel:
                continue
            if args.in_place:
                rel = Path(img_rel)
                blurred_rel = str(rel.with_name(rel.stem + args.suffix + rel.suffix))
                blurred_abs = (input_root / img_rel).with_name(Path(blurred_rel).name)
            else:
                # Mirrored tree: same relative name under the output root.
                blurred_rel = img_rel
                blurred_abs = args.output_root / blurred_rel
            if blurred_abs.exists():
                seg["blurred_image"] = blurred_rel
                changed = True

    if not changed:
        print(
            "[INFO] No mapping changes applied (maybe images not yet generated or already present)."
        )
        return

    # Precedence: explicit --mapping-output, then the caller-supplied
    # output_path, then overwrite the source mapping.
    out_path = getattr(args, "mapping_output", None) or output_path or mapping_path
    out_path.write_text(
        json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"[INFO] Updated mapping JSON with blurred_image fields: {out_path}")
def blur_handwriting(
    input_root: Path,
    output_root: Optional[Path] = None,
    in_place: bool = False,
    suffix: str = "_b",
    radius: Optional[float] = None,
    radius_min: float = 0.35,
    radius_max: float = 0.85,
    antialias: bool = False,
    scale_factor: float = 0.75,
    alpha_erosion: int = 0,
    alpha_dilation: int = 0,
    feather: float = 0.6,
    contrast: float = 1.02,
    ink_gamma: float = 0.98,
    add_noise: float = 0.35,
    unsharp: Optional[str] = "0.5,30,2",
    max_alpha: Optional[int] = None,
    mapping_json: Optional[Path] = None,
    mapping_output: Optional[Path] = None,
    append_mapping: bool = False,
    skip_existing: bool = False,
    extensions: str = ".png",
    seed: int = 42,
    no_progress: bool = False,
) -> int:
    """Apply soft blur to handwriting token images.

    Mirrors the command-line behavior while exposing a reusable API.

    Exactly one of ``output_root`` (mirror the input tree there) or
    ``in_place=True`` (write ``<stem><suffix><ext>`` next to each source) must
    be chosen. ``extensions`` is a comma-separated list; entries are stripped,
    lower-cased and given a leading dot, so ``"PNG, jpg"`` works. ``seed``
    seeds both ``random`` (blur radius) and ``numpy.random`` (noise), so
    repeated runs with the same seed produce the same output.

    Returns:
        The number of images actually written.

    Raises:
        ValueError: On an invalid output-mode or radius combination.
        FileNotFoundError: If ``input_root`` does not exist.
    """
    # Basic validation to mirror CLI expectations
    if in_place and output_root is not None:
        raise ValueError("Cannot specify output_root with in_place=True.")
    if not in_place and output_root is None:
        raise ValueError("Either provide output_root or set in_place=True.")
    if radius is not None and radius <= 0:
        raise ValueError("--radius must be > 0")
    if radius is None and radius_min <= 0:
        raise ValueError("--radius-min must be > 0")
    if radius is None and radius_max < radius_min:
        raise ValueError("--radius-max must be >= --radius-min")

    # The blur radius is drawn from `random`, the grain from `np.random`:
    # seed both. NumPy only accepts seeds in [0, 2**32).
    random.seed(seed)
    np.random.seed(seed % (2**32))

    # Normalise the extension list; image suffixes are compared lower-cased.
    exts: List[str] = []
    for raw in extensions.split(","):
        ext = raw.strip().lower()
        if ext:
            exts.append(ext if ext.startswith(".") else f".{ext}")

    if not input_root.exists():
        raise FileNotFoundError(f"Input root not found: {input_root}")
    if not in_place and output_root is not None:
        output_root.mkdir(parents=True, exist_ok=True)

    cfg = BlurConfig(
        radius=radius,
        radius_min=radius_min,
        radius_max=radius_max,
        antialias=antialias,
        scale_factor=scale_factor,
        suffix=suffix,
        skip_existing=skip_existing,
        alpha_erosion=alpha_erosion,
        alpha_dilation=alpha_dilation,
        feather=feather,
        contrast=contrast,
        ink_gamma=ink_gamma,
        add_noise=add_noise,
        unsharp=unsharp,
        max_alpha=max_alpha,
    )

    images = list(iter_images(input_root, exts))
    if not images:
        print("[WARN] No images found to process.")

    iterator = images
    if not no_progress and tqdm is not None:
        iterator = tqdm(images, desc="Blurring tokens", unit="img")  # type: ignore

    # Minimal namespace to reuse map_destination/update_mapping without changing their signatures
    ns = SimpleNamespace(
        in_place=in_place,
        suffix=suffix,
        output_root=output_root,
        mapping_output=mapping_output,
    )

    processed = 0
    for img_path in iterator:  # type: ignore
        dst = map_destination(img_path, ns, input_root)
        if process_image(img_path, dst, cfg):
            processed += 1

    print(f"[INFO] Blurred {processed} / {len(images)} images.")

    if append_mapping and mapping_json:
        # Write to the requested output, defaulting to overwriting the source.
        update_mapping(mapping_json, mapping_output or mapping_json, input_root, ns)

    return processed
a/docgenie/generation/handwriting_diffusion/generate_handwriting_diffusion_raw.py b/docgenie/generation/handwriting_diffusion/generate_handwriting_diffusion_raw.py new file mode 100755 index 0000000000000000000000000000000000000000..0c266dcbc8023b5f90503cf6b55453277753e220 --- /dev/null +++ b/docgenie/generation/handwriting_diffusion/generate_handwriting_diffusion_raw.py @@ -0,0 +1,1680 @@ +#!/usr/bin/env python3 +""" +Diffusion-based handwriting token generator with intelligent word splitting and stitching. + +This script: + - Reads handwriting bbox JSON files with format: "x1,y1,x2,y2,text,block_no,line_no,word_no" + - Intelligently splits long words internally based on --split-length parameter + - Splits numeric sequences within tokens into configurable chunk sizes (default: 2) + - Generates handwriting using HuggingFace diffusion model with text conditioning + - Stitches split word segments horizontally with baseline alignment + - Supports sentence-level reconstruction using line metadata + - Outputs transparent RGBA images with tight cropping + - Maintains consistent writer styles per document + - Supports batched generation for GPU efficiency + +Usage example: + python scripts/generate_handwriting_diffusion_raw.py \ + --input-dir docvqa-handwritten-sizes4/handwriting_bbox \ + --output-dir docvqa-handwritten-sizes4/handwriting_raw_tokens \ + --run-dir model/experiments/hf_conditional_latent \ + --checkpoint latest.pt \ + --steps 30 --split-length 7 --batch-size 8 --device cuda + +With sentence stitching and custom baseline: + python scripts/generate_handwriting_diffusion_raw.py \ + --input-dir docvqa-handwritten-sizes4/handwriting_bbox \ + --output-dir docvqa-handwritten-sizes4/handwriting_raw_tokens \ + --run-dir model/experiments/hf_conditional_latent \ + --checkpoint latest.pt \ + --steps 30 --split-length 7 --stitch-sentences \ + --baseline-percentile 85.0 --device cuda + +Install requirements: + pip install torch diffusers transformers Pillow PyYAML + 
+Mapping file (raw_token_map.json) structure: +{ + "backend": "diffusion-hf", + "split_length": 7, + "entries": [ + { + "source_json": "example.json", + "hw_id": "hw0", + "author_id": "author1", + "words": [ + { + "block_no": 22, + "line_no": 0, + "word_no": 0, + "image": "example/hw0_0.png", + "style_id": 123, + "width": 250, + "height": 64, + "segments": [ + {"token": "genera", "bbox": [x1,y1,x2,y2]}, + {"token": "tion", "bbox": [x1,y1,x2,y2]} + ] + } + ] + } + ], + "file_author_styles": {"example.json": {"author1": {"style_id": 123}}} +} +""" + +from __future__ import annotations +import argparse +import json +import math +import random +import sys +from copy import deepcopy +from datetime import datetime +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +from collections import defaultdict + +from .tokenizer import CharTokenizer +from .text_encoder import TextEncoder + +try: + import torch + import torch.nn as nn + from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + ) + from diffusers.training_utils import EMAModel + import numpy as np + from PIL import Image + import yaml + from rich.progress import Progress +except Exception as e: + print( + "[ERROR] Missing dependencies. Install: torch diffusers transformers Pillow PyYAML", + file=sys.stderr, + ) + raise + + +BBox = Tuple[float, float, float, float] + + +@dataclass +class WordSegment: + """Represents a segment of a word after splitting.""" + + token: str + bbox: BBox + original_index: ( + int # Track which part of the word this is (0=first, 1=second, etc.) 
def parse_bbox_record(rec: str) -> "Tuple[BBox, str, int, int, int]":
    """Decode one record of the form ``x1,y1,x2,y2,text,block_no,line_no,word_no``.

    The text itself may contain commas: the first four fields are always the
    box coordinates and the last three the integer indices, so whatever lies
    between them is joined back together as the token.

    Returns:
        ``((x1, y1, x2, y2), token, block_no, line_no, word_no)``.

    Raises:
        ValueError: If the record has fewer than 8 comma-separated fields or a
            coordinate / index field is not numeric.
    """
    fields = rec.split(",")
    if len(fields) < 8:
        raise ValueError(f"Invalid bbox record (expected at least 8 parts): {rec}")

    head, middle, tail = fields[:4], fields[4:-3], fields[-3:]
    x1, y1, x2, y2 = (float(value) for value in head)
    block_no, line_no, word_no = (int(value) for value in tail)
    return (x1, y1, x2, y2), ",".join(middle), block_no, line_no, word_no
+ + Args: + word: The word to split + split_length: Maximum length for each segment + + Returns: + List of word segments (all <= split_length) + + Examples: + split_word("generation", 4) -> ["gen", "era", "tio", "n"] (3, 3, 3, 1) + split_word("generation", 5) -> ["gener", "ation"] (5, 5) + split_word("extraordinary", 7) -> ["extraor", "dinary"] (7, 7) + split_word("extraordinary", 5) -> ["extra", "ordin", "ary"] (5, 5, 3) + split_word("hello", 10) -> ["hello"] (5) + + Strategy: + - Calculate minimum number of segments needed (ceil(len/split_length)) + - Distribute characters as evenly as possible + - Ensure no segment exceeds split_length + """ + if split_length <= 0: + return [word] + + word_len = len(word) + + if word_len <= split_length: + return [word] + + # Calculate minimum number of segments needed + num_segments = (word_len + split_length - 1) // split_length # Ceiling division + + # Calculate base length for each segment (will be <= split_length) + base_length = word_len // num_segments + remainder = word_len % num_segments + + # Verify base_length doesn't exceed split_length + # (This should always be true given our calculation, but being safe) + assert base_length <= split_length, ( + f"base_length {base_length} exceeds split_length {split_length}" + ) + + # Build segments: first 'remainder' segments get base_length+1, rest get base_length + segments = [] + start = 0 + + for i in range(num_segments): + # First 'remainder' segments get one extra character + seg_length = base_length + (1 if i < remainder else 0) + segments.append(word[start : start + seg_length]) + start += seg_length + + # Verify all segments are <= split_length + for seg in segments: + assert len(seg) <= split_length, ( + f"Segment '{seg}' (len={len(seg)}) exceeds split_length {split_length}" + ) + + return segments + + +def split_token_preserving_digit_chunks( + token: str, split_length_words: int, split_length_numeric: int +) -> List[str]: + """ + Split a token while keeping numeric 
def split_word_with_spaces(
    word: str, split_length_words: int, split_length_numeric: int
) -> List[Tuple[str, bool]]:
    """
    Split a word at spaces first, then split each part by length.

    Args:
        word: The word to split (may contain spaces).
        split_length_words: Maximum length for each non-numeric segment.
        split_length_numeric: Maximum length for numeric sequences within each
            token (<=0 disables special handling).

    Returns:
        List of tuples (segment_text, space_before). ``space_before`` is True
        only for the first segment of a part that is not the first element of
        ``word.split(" ")``. A word starting with a space therefore yields
        ``space_before=True`` on its very first segment. An empty ``word``
        yields an empty list.

    Examples:
        split_word_with_spaces("hello world", 10, 2)
            -> [("hello", False), ("world", True)]
        split_word_with_spaces("very long phrase", 5, 2)
            -> [("very", False), ("long", True), ("phr", True), ("ase", False)]
        split_word_with_spaces("hello", 3, 2)
            -> [("hel", False), ("lo", False)]

    Strategy:
        1. Split at single spaces; empty parts (consecutive spaces) are dropped.
        2. Apply length-based splitting (with digit chunking) to each part.
        3. Flag the first segment of every non-first part with space_before=True.
    """
    if not word:
        return []

    result: List[Tuple[str, bool]] = []
    for part_idx, part in enumerate(word.split(" ")):
        if not part:  # Produced by consecutive, leading, or trailing spaces
            continue

        length_segments = split_token_preserving_digit_chunks(
            part, split_length_words, split_length_numeric
        )
        for seg_idx, seg in enumerate(length_segments):
            # Only the first segment after a space boundary carries the flag;
            # later segments of the same part are pure length splits.
            result.append((seg, part_idx > 0 and seg_idx == 0))

    return result
+ + Args: + json_path: Path to the JSON file + data: Parsed JSON data + split_length_words: Maximum word length before splitting + split_length_numeric: Maximum length for numeric sequences within tokens (<=0 disables special handling) + + Returns: + Tuple of (word tasks, extraction log entries) + """ + tasks: List[WordTask] = [] + extraction_logs: List[Dict[str, Any]] = [] + fallback_counters: Dict[str, int] = defaultdict(int) + zero_bbox: BBox = (0.0, 0.0, 0.0, 0.0) + + for obj in data: + # Skip entries without valid data + if obj is None: + continue + + hw_id = obj.get("id") + author_id = obj.get("author-id") or obj.get("author_id") + bboxes = obj.get("bboxes") + text_content = (obj.get("text") or "").strip() + + # Skip entries with None or empty bboxes + if bboxes is None or not bboxes: + if not text_content: + extraction_logs.append( + { + "type": "extraction_skip", + "source_json": json_path.name, + "hw_id": hw_id, + "reason": "missing_bbox_no_text", + } + ) + continue + + fallback_words = [w for w in text_content.split() if w] + if not fallback_words: + extraction_logs.append( + { + "type": "extraction_skip", + "source_json": json_path.name, + "hw_id": hw_id, + "reason": "missing_bbox_no_tokens", + } + ) + continue + + for fallback_idx, raw_word in enumerate(fallback_words): + word_segments_with_flags = split_word_with_spaces( + raw_word, split_length_words, split_length_numeric + ) + if not word_segments_with_flags: + continue + + segments: List[WordSegment] = [] + for seg_idx, (seg_text, space_before) in enumerate( + word_segments_with_flags + ): + segments.append( + WordSegment( + token=seg_text, + bbox=zero_bbox, + original_index=seg_idx, + space_before=space_before, + ) + ) + + fallback_counter = fallback_counters[hw_id] + fallback_counters[hw_id] += 1 + tasks.append( + WordTask( + source_json=json_path.name, + hw_id=hw_id, + author_id=author_id, + block_no=-1, + line_no=-1, + word_no=100000 + fallback_counter, + segments=segments, + 
def style_id_for_file(json_name: str, author_id: str, seed: int, vocab: int) -> int:
    """
    Deterministically derive a style id for a (json_name, author_id) combo.

    The built-in ``hash()`` is salted per interpreter process for ``str``
    values, so it would give a different style on every run. A SHA-256 digest
    is used instead so the same inputs always map to the same id.

    Args:
        json_name: Name of the source JSON file.
        author_id: Author identifier inside that file (formatted with ``str()``).
        seed: Integer XOR-ed into the hash so different seeds give different ids.
        vocab: Number of available style ids; the result is taken modulo this.

    Returns:
        Style id in ``range(vocab)`` for a positive ``vocab``.
    """
    import hashlib  # local import: keeps the block self-contained

    composite = f"{json_name}::{author_id}"
    digest = hashlib.sha256(composite.encode("utf-8")).digest()
    # 64 bits of the digest are plenty for a modulo over the style vocabulary.
    stable_hash = int.from_bytes(digest[:8], "big")
    return (stable_hash ^ seed) % vocab
Dict[str, Any]: + """ + Load model components from experiment directory. + Based on inference_hf.ipynb load_experiment function. + """ + run_dir = run_dir.expanduser().resolve() + if not run_dir.exists(): + raise FileNotFoundError(f"Run directory {run_dir} does not exist.") + + config_path = run_dir / "config.yaml" + if not config_path.exists(): + raise FileNotFoundError(f"Expected config at {config_path}.") + + with open(config_path, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + # Load tokenizer + vocab_path = Path(config["data"]["vocab_path"]) + if not vocab_path.is_absolute(): + vocab_path = run_dir / vocab_path + if not vocab_path.exists(): + vocab_path = run_dir.parent / config["data"]["vocab_path"] + + tokenizer = CharTokenizer.load(str(vocab_path)) + + # Load writer_id_map + writer_map_path = run_dir / "writer_id_map.json" + if not writer_map_path.exists(): + raise FileNotFoundError(f"Expected writer mapping at {writer_map_path}.") + with open(writer_map_path, "r", encoding="utf-8") as f: + raw_writer_map = json.load(f) + writer_id_map = {str(k): int(v) for k, v in raw_writer_map.items()} + num_writers = len(writer_id_map) + + # Load text encoder + text_cfg = config["model"]["text_encoder"] + text_encoder = TextEncoder( + vocab_size=len(tokenizer), + d_model=text_cfg["d_model"], + num_layers=text_cfg["num_layers"], + num_heads=text_cfg["num_heads"], + d_ff=text_cfg["d_ff"], + dropout=text_cfg["dropout"], + max_length=text_cfg["max_length"], + output_dim=text_cfg.get("output_dim", text_cfg["d_model"]), + ).to(device) + text_encoder.eval() + + # Load UNet + unet_cfg = deepcopy(config["model"]["unet"]) + pretrained_path = unet_cfg.pop("pretrained_model_name_or_path", None) + + # Ensure tuple types + for key in ("down_block_types", "up_block_types", "block_out_channels"): + if key in unet_cfg and isinstance(unet_cfg[key], list): + unet_cfg[key] = tuple(unet_cfg[key]) + + if "sample_size" in unet_cfg and isinstance(unet_cfg["sample_size"], list): 
+ unet_cfg["sample_size"] = tuple(unet_cfg["sample_size"]) + + # Set num_class_embeds from writer_id_map + unet_cfg["num_class_embeds"] = num_writers + + if pretrained_path: + unet = UNet2DConditionModel.from_pretrained( + pretrained_path, num_class_embeds=num_writers + ).to(device) + else: + unet = UNet2DConditionModel(**unet_cfg).to(device) + + unet.eval() + + # Load scheduler - using DPM-Solver++ with order 3 for fast, high-quality sampling + scheduler_cfg = config["model"]["scheduler"] + noise_scheduler = DPMSolverMultistepScheduler( + num_train_timesteps=scheduler_cfg["num_train_timesteps"], + beta_start=scheduler_cfg["beta_start"], + beta_end=scheduler_cfg["beta_end"], + beta_schedule=scheduler_cfg["beta_schedule"], + prediction_type=scheduler_cfg.get("prediction_type", "epsilon"), + algorithm_type="dpmsolver++", + solver_order=3, # Higher order = better quality + use_karras_sigmas=scheduler_cfg.get("use_karras_sigmas", False), + ) + # Add timestep_spacing if specified in config + if "timestep_spacing" in scheduler_cfg: + noise_scheduler.config.timestep_spacing = scheduler_cfg["timestep_spacing"] + + # Load VAE if latent mode + mode = config["training"].get("mode", "latent") + vae = None + vae_scale_factor = 0.18215 + if mode == "latent": + vae_config = config["model"].get("vae") + if vae_config is None: + raise KeyError("Latent mode requires 'model.vae' configuration.") + vae_model_name = vae_config["model_name"] + + vae_cache_dir = run_dir / "cached_vae" + if vae_cache_dir.exists(): + vae = AutoencoderKL.from_pretrained(vae_cache_dir).to(device) + else: + vae = AutoencoderKL.from_pretrained(vae_model_name).to(device) + vae_cache_dir.mkdir(parents=True, exist_ok=True) + vae.save_pretrained(vae_cache_dir) + + vae.eval() + + # Load checkpoint + checkpoint_path = run_dir / checkpoint_name + + print(checkpoint_path) + if not checkpoint_path.exists(): + checkpoint_path = Path(checkpoint_name) + if not checkpoint_path.exists(): + raise 
def diffusion_generate_batch(
    tokens: List[str],
    style_ids: List[int],
    components: Dict[str, Any],
    steps: int,
    temperature: float = 1.0,
) -> List[Image.Image]:
    """
    Generate a batch of handwriting images using the diffusion model.
    Based on sample_diffusion from inference_hf.ipynb.

    Args:
        tokens: Text of each image to generate (one image per token).
        style_ids: Writer class index for each token, passed to the UNet as
            ``class_labels``.
        components: Mapping that provides "device", "tokenizer", "text_encoder",
            "unet", "noise_scheduler", "sample_shape" and "mode", and optionally
            "vae" and "vae_scale_factor".
        steps: Number of scheduler inference steps.
        temperature: Multiplier applied to the initial Gaussian noise.

    Returns:
        One binarized, content-cropped RGBA image per token. The crop box
        returned by ``crop_to_content`` is stored in ``img.info["crop_box"]``.
        An empty ``tokens`` list yields an empty list.
    """
    if not tokens:
        return []

    device = components["device"]
    tokenizer = components["tokenizer"]
    text_encoder = components["text_encoder"]
    unet = components["unet"]
    noise_scheduler = components["noise_scheduler"]
    sample_shape = components["sample_shape"]
    mode = components["mode"]
    vae = components.get("vae")
    vae_scale_factor = components.get("vae_scale_factor", 0.18215)

    batch_size = len(tokens)

    # Encode text
    encodings = tokenizer.encode_batch(tokens)
    input_ids = torch.tensor(encodings["input_ids"], device=device, dtype=torch.long)
    attention_mask = torch.tensor(
        encodings["attention_mask"], device=device, dtype=torch.float32
    )

    # Convert writer style IDs to class indices
    writer_indices = torch.tensor(style_ids, device=device, dtype=torch.long)

    # Set timesteps
    noise_scheduler.set_timesteps(steps, device=device)
    timesteps = noise_scheduler.timesteps

    # Initialize latents
    batch_shape = (batch_size,) + tuple(sample_shape)
    latents = torch.randn(batch_shape, device=device) * temperature

    # The whole pipeline is inference-only. Without no_grad the UNet and VAE
    # calls build autograd graphs at every step, which wastes memory and makes
    # the later .numpy() conversion fail on tensors that require grad.
    with torch.no_grad():
        # Generate text features
        text_features = text_encoder(input_ids, attention_mask=attention_mask)

        # Sampling loop
        for timestep in timesteps:
            step_index = int(timestep)
            t_batch = torch.full(
                (batch_size,), step_index, device=device, dtype=torch.long
            )

            model_output = unet(
                latents,
                t_batch,
                encoder_hidden_states=text_features,
                encoder_attention_mask=attention_mask,
                class_labels=writer_indices,
            )
            # The UNet may return an output object or a bare tensor.
            noise_pred = (
                model_output.sample if hasattr(model_output, "sample") else model_output
            )

            latents = noise_scheduler.step(noise_pred, step_index, latents).prev_sample

        # Decode if latent mode
        if mode == "latent" and vae is not None:
            decoded = vae.decode(latents / vae_scale_factor).sample
        else:
            decoded = latents

        # Map from [-1, 1] to [0, 1]
        images = (decoded / 2 + 0.5).clamp(0.0, 1.0)

    # Convert to PIL images with cropping and transparency
    results: List[Image.Image] = []
    imgs = images.detach().cpu().numpy()

    for i in range(batch_size):
        arr = imgs[i]
        if arr.shape[0] == 1:
            arr = arr[0]  # Remove channel dim if grayscale
        else:
            arr = arr.transpose(1, 2, 0)  # CHW -> HWC

        arr8 = (arr * 255).round().astype("uint8")

        # Collapse colour channels to a single grayscale plane before binarizing
        if arr8.ndim == 3:
            arr8 = arr8.mean(axis=2).astype("uint8")

        # Binarize: pixels above the Otsu threshold become 255, the rest 0
        thresh = otsu_threshold(arr8)
        bin_arr = (arr8 > thresh).astype("uint8") * 255

        # Crop to content
        cropped, crop_box = crop_to_content(bin_arr)

        # Convert to RGBA
        rgba = binary_to_rgba(cropped)
        rgba.info["crop_box"] = crop_box
        results.append(rgba)

    return results
def pad_tokens_to_equal_length(tokens: List[str]) -> List[str]:
    """
    Pad tokens to equal length by appending spaces to shorter tokens.

    Args:
        tokens: Tokens to pad.

    Returns:
        A new list in which every token is right-padded with spaces to the
        length of the longest token. An empty input is returned as-is.
    """
    if not tokens:
        return tokens
    max_len = max(len(t) for t in tokens)
    # Build the padded list once; the earlier debug print of it was removed
    # so that library callers do not get stdout noise.
    return [t.ljust(max_len) for t in tokens]
def concatenate_images_horizontal(
    images: List[Image.Image],
    gap: int = 0,
    baseline_align: bool = True,
    baseline_percentile: float = 75.0,
) -> Image.Image:
    """
    Horizontally concatenate a list of RGBA images, optionally baseline-aligned.

    Args:
        images: List of RGBA images to concatenate.
        gap: Spacing between consecutive images in pixels.
        baseline_align: If True, align by detected baseline; if False, center
            each image vertically on the canvas.
        baseline_percentile: Percentile for baseline detection (default: 75.0).

    Returns:
        Concatenated RGBA image on a transparent canvas. When ``images`` has a
        single element, that same image object is returned unchanged.

    Raises:
        ValueError: If ``images`` is empty.
    """
    if not images:
        raise ValueError("Cannot concatenate empty image list")
    if len(images) == 1:
        return images[0]

    total_width = sum(img.width for img in images) + gap * (len(images) - 1)

    if baseline_align:
        # Reuse the shared baseline detector instead of duplicating its logic.
        baselines = [
            calculate_baseline_info(img, baseline_percentile=baseline_percentile)[
                "baseline_y"
            ]
            for img in images
        ]

        # The canvas must fit the tallest part above and below the baseline.
        max_above_baseline = max(0, *baselines)
        max_below_baseline = max(
            0, *(img.height - 1 - base for img, base in zip(images, baselines))
        )
        canvas_height = max_above_baseline + 1 + max_below_baseline

        # Shift each image down so that all baselines land on the same row.
        y_offsets = [max_above_baseline - base for base in baselines]
    else:
        # Simple vertical centering
        canvas_height = max(img.height for img in images)
        y_offsets = [(canvas_height - img.height) // 2 for img in images]

    result = Image.new("RGBA", (total_width, canvas_height), (0, 0, 0, 0))

    x_offset = 0
    for img, y_offset in zip(images, y_offsets):
        # The image is passed as its own mask so its alpha channel is used.
        result.paste(img, (x_offset, y_offset), img)
        x_offset += img.width + gap

    return result
bottom_candidates.append(int(ink_rows[-1])) + + if bottom_candidates: + baseline = int(np.percentile(bottom_candidates, baseline_percentile)) + else: + baseline = img.height - 1 + + baselines.append(baseline) + above = baseline + below = img.height - 1 - baseline + max_above_baseline = max(max_above_baseline, above) + max_below_baseline = max(max_below_baseline, below) + + # Calculate total width based on variable gaps + canvas_height = max_above_baseline + 1 + max_below_baseline + total_width = sum(img.width for img in images) + for i in range(1, len(images)): + # Use word_gap if this segment had a space before it, else segment_gap + gap = word_gap if segments[i].space_before else segment_gap + total_width += gap + + # Create canvas and paste images + result = Image.new("RGBA", (total_width, canvas_height), (0, 0, 0, 0)) + + x_offset = 0 + for i, (img, baseline, segment) in enumerate(zip(images, baselines, segments)): + y_offset = max_above_baseline - baseline + result.paste(img, (x_offset, y_offset), img) + x_offset += img.width + + # Add appropriate gap before next image + if i < len(images) - 1: + gap = word_gap if segments[i + 1].space_before else segment_gap + x_offset += gap + + return result + + +# -------------------------- main ----------------------------- + + +def generate_handwriting( + input_dir: Path, + output_dir: Path, + run_dir: Path, + checkpoint: str = "latest.pt", + progress: Progress | None = None, + steps: int = 30, + split_length_words: int = 6, + split_length_numeric: int = 2, + temperature: float = 0.5, + seed: int = 42, + device: str = "cuda", + overwrite: bool = False, + mapping_file: Optional[Path] = None, + log_file: Optional[Path] = None, + batch_size: int = 32, + stitch_sentences: bool = True, + segment_gap: int = 2, + word_gap: int = 20, + baseline_percentile: float = 75.0, + allowed_writers: Optional[List[str]] = None, +) -> None: + """Generate handwriting images and metadata using configured diffusion models.""" + 
random.seed(seed) + torch.manual_seed(seed) + device_obj = torch.device( + device if torch.cuda.is_available() or device == "cpu" else "cpu" + ) + + input_dir = Path(input_dir) + output_dir = Path(output_dir) + run_dir = Path(run_dir) + mapping_file = Path(mapping_file) if mapping_file is not None else None + log_file = Path(log_file) if log_file is not None else None + + # Load model components + print(f"Loading model from {run_dir}...") + components = load_experiment(run_dir, checkpoint, device_obj) + print(f"✓ Model loaded successfully") + print(f" Mode: {components['mode']}") + print(f" Sample shape: {components['sample_shape']}") + print(f" Writers: {len(components['writer_id_map'])}") + + output_dir.mkdir(parents=True, exist_ok=True) + + # Load JSON files + json_files = list_json_files(input_dir) + if not json_files: + print("[ERROR] No JSON files found.", file=sys.stderr) + sys.exit(1) + + print(f"Found {len(json_files)} JSON files") + + # Extract tasks with word splitting + tasks: List[WordTask] = [] + extraction_logs: List[Dict[str, Any]] = [] + for jf in json_files: + data = load_json(jf) + extracted_tasks, extracted_log_entries = extract_tasks( + jf, data, split_length_words, split_length_numeric + ) + tasks.extend(extracted_tasks) + extraction_logs.extend(extracted_log_entries) + + print(f"Extracted {len(tasks)} word tasks") + if split_length_words > 0: + total_segments = sum(len(t.segments) for t in tasks) + print( + f" Split into {total_segments} segments (split_length={split_length_words}, digit_chunk_length={split_length_numeric})" + ) + + # Per-file author style mapping + file_author_style_ids: Dict[str, Dict[str, int]] = {} + writer_id_map = components["writer_id_map"] + + # Filter to allowed writers if specified + allowed_writer_ids = None + if allowed_writers is not None: + allowed_writer_ids = [] + for w in allowed_writers: + try: + writer_id = int(w) + if 0 <= writer_id < len(writer_id_map): + allowed_writer_ids.append(writer_id) + else: + 
print( + f"[WARNING] Writer ID {writer_id} out of range (0-{len(writer_id_map) - 1}), ignoring" + ) + except ValueError: + print(f"[WARNING] Invalid writer ID '{w}', must be integer, ignoring") + + if not allowed_writer_ids: + print("[ERROR] No valid writer IDs provided in --allowed-writers") + sys.exit(1) + + print( + f"Using {len(allowed_writer_ids)} allowed writer(s): {sorted(allowed_writer_ids)}" + ) + + # Set up RNG for random writer selection if needed + rng = random.Random(seed) + + for t in tasks: + file_author_style_ids.setdefault(t.source_json, {}) + if t.author_id not in file_author_style_ids[t.source_json]: + # Map author_id to writer index from the model's writer_id_map + if t.author_id in writer_id_map: + style_id = writer_id_map[t.author_id] + # If allowed_writers specified and this author's style not in list, randomly pick from allowed + if ( + allowed_writer_ids is not None + and style_id not in allowed_writer_ids + ): + style_id = rng.choice(allowed_writer_ids) + else: + # Author not in map: use allowed writers if specified, else fallback to hashing + if allowed_writer_ids is not None: + style_id = rng.choice(allowed_writer_ids) + else: + style_id = style_id_for_file( + t.source_json, t.author_id, seed, len(writer_id_map) + ) + file_author_style_ids[t.source_json][t.author_id] = style_id + + results: List[Dict[str, Any]] = [] + generation_skip_log: List[Dict[str, Any]] = [] + generation_error_log: List[Dict[str, Any]] = [] + sentence_exclusion_log: List[Dict[str, Any]] = [] + total_words = len(tasks) + effective_batch_size = max(1, batch_size) + progress = progress or Progress(transient=True) + generation_task_id = progress.add_task("Generating words", total=total_words) + + for word_idx in range(0, total_words, effective_batch_size): + batch_tasks = tasks[word_idx : word_idx + effective_batch_size] + + # Process each word task + for task in batch_tasks: + json_stem = Path(task.source_json).stem + doc_dir = output_dir / json_stem + 
doc_dir.mkdir(parents=True, exist_ok=True) + + # Output filename includes block and line numbers to avoid collisions across lines + out_name = build_word_filename(task) + relative_image_path = f"{json_stem}/{out_name}" + out_path = doc_dir / out_name + + if out_path.exists() and not overwrite: + # Load existing metadata + try: + existing_img = Image.open(out_path) + w, h = existing_img.size + baseline_info = calculate_baseline_info( + existing_img, baseline_percentile=baseline_percentile + ) + results.append( + { + "image": relative_image_path, + "hw_id": task.hw_id, + "author_id": task.author_id, + "style_id": file_author_style_ids[task.source_json][ + task.author_id + ], + "source_json": task.source_json, + "block_no": task.block_no, + "line_no": task.line_no, + "word_no": task.word_no, + "segments": [ + { + "token": seg.token, + "bbox": list(seg.bbox), + "space_before": seg.space_before, + } + for seg in task.segments + ], + "skipped": True, + "skip_reason": "existing_output", + "include_in_sentence": task.include_in_sentence, + "sentence_exclusion_reason": task.sentence_exclusion_reason, + "width": w, + "height": h, + "baseline": baseline_info, + } + ) + generation_skip_log.append( + { + "type": "existing_output", + "source_json": task.source_json, + "hw_id": task.hw_id, + "word_no": task.word_no, + "block_no": task.block_no, + "line_no": task.line_no, + "image": relative_image_path, + } + ) + if not task.include_in_sentence: + sentence_exclusion_log.append( + { + "source_json": task.source_json, + "hw_id": task.hw_id, + "word_no": task.word_no, + "block_no": task.block_no, + "line_no": task.line_no, + "image": relative_image_path, + "reason": task.sentence_exclusion_reason + or "manual_exclusion", + } + ) + except Exception as e: + print(f"[WARN] Could not load existing {out_path}: {e}") + continue + + # Generate all segments for this word + try: + tokens_batch = [seg.token for seg in task.segments] + style_id = 
file_author_style_ids[task.source_json][task.author_id] + style_ids_batch = [style_id] * len(tokens_batch) + + segment_images = diffusion_generate_batch( + tokens_batch, + style_ids_batch, + components, + steps, + temperature=temperature, + ) + + # Concatenate segments with variable gaps (word-gap for spaces, segment-gap for length splits) + if len(segment_images) > 1: + final_image = concatenate_segments_with_variable_gaps( + segment_images, + task.segments, + segment_gap=segment_gap, + word_gap=word_gap, + baseline_percentile=baseline_percentile, + ) + else: + final_image = segment_images[0] + + # Save + w, h = final_image.size + final_image.save(out_path) + + # Calculate baseline information for alignment + baseline_info = calculate_baseline_info( + final_image, baseline_percentile=baseline_percentile + ) + + results.append( + { + "image": relative_image_path, + "hw_id": task.hw_id, + "author_id": task.author_id, + "style_id": style_id, + "source_json": task.source_json, + "block_no": task.block_no, + "line_no": task.line_no, + "word_no": task.word_no, + "segments": [ + { + "token": seg.token, + "bbox": list(seg.bbox), + "space_before": seg.space_before, + } + for seg in task.segments + ], + "skipped": False, + "skip_reason": None, + "include_in_sentence": task.include_in_sentence, + "sentence_exclusion_reason": task.sentence_exclusion_reason, + "width": w, + "height": h, + "baseline": baseline_info, + } + ) + if not task.include_in_sentence: + sentence_exclusion_log.append( + { + "source_json": task.source_json, + "hw_id": task.hw_id, + "word_no": task.word_no, + "block_no": task.block_no, + "line_no": task.line_no, + "image": relative_image_path, + "reason": task.sentence_exclusion_reason + or "manual_exclusion", + } + ) + except Exception as e: + print( + f"[ERROR] Generation failed for {task.hw_id} word {task.word_no}: {e}", + file=sys.stderr, + ) + import traceback + + traceback.print_exc() + generation_error_log.append( + { + "type": "generation_error", + 
"source_json": task.source_json, + "hw_id": task.hw_id, + "word_no": task.word_no, + "block_no": task.block_no, + "line_no": task.line_no, + "reason": str(e), + "traceback": traceback.format_exc(), + } + ) + + if progress and generation_task_id is not None: + progress.advance(generation_task_id, len(batch_tasks)) + + # Sentence-level stitching (if requested) + if stitch_sentences: + print("\nStitching words into sentences...") + sentences_dir = output_dir / "sentences" + sentences_dir.mkdir(exist_ok=True) + + # Group results by (source_json, hw_id, block_no, line_no) + line_groups: Dict[Tuple[str, str, int, int], List[Dict[str, Any]]] = {} + for r in results: + if r["skipped"]: + continue + if not r.get("include_in_sentence", True): + continue + key = (r["source_json"], r["hw_id"], r["block_no"], r["line_no"]) + line_groups.setdefault(key, []).append(r) + + # Sort words within each line by word_no + for key in line_groups: + line_groups[key].sort(key=lambda x: x["word_no"]) + + sentence_results: List[Dict[str, Any]] = [] + sentence_progress = progress + sentence_task_id = sentence_progress.add_task( + "Stitching sentences", total=len(line_groups) + ) + + for (source_json, hw_id, block_no, line_no), word_list in line_groups.items(): + if not word_list: + continue + + json_stem = Path(source_json).stem + sent_doc_dir = sentences_dir / json_stem + sent_doc_dir.mkdir(parents=True, exist_ok=True) + + # Output filename: hw{id}_block{block}_line{line}.png + sent_name = f"{hw_id}_block{block_no}_line{line_no}.png" + sent_relative_path = f"sentences/{json_stem}/{sent_name}" + sent_path = sent_doc_dir / sent_name + + if sent_path.exists() and not overwrite: + if sentence_progress and sentence_task_id is not None: + sentence_progress.advance(sentence_task_id, 1) + continue + + try: + # Load all word images for this line + word_images = [] + for word_data in word_list: + word_img_path = output_dir / word_data["image"] + if word_img_path.exists(): + 
word_images.append(Image.open(word_img_path)) + + if not word_images: + continue + + # Stitch words together with larger gap + sentence_image = concatenate_images_horizontal( + word_images, + gap=word_gap, + baseline_align=True, + baseline_percentile=baseline_percentile, + ) + + # Save sentence image + sentence_image.save(sent_path) + + # Collect text for this line + line_text = " ".join( + [ + "".join([seg["token"] for seg in w["segments"]]) + for w in word_list + ] + ) + + sentence_results.append( + { + "image": sent_relative_path, + "source_json": source_json, + "hw_id": hw_id, + "block_no": block_no, + "line_no": line_no, + "text": line_text, + "num_words": len(word_list), + "width": sentence_image.width, + "height": sentence_image.height, + } + ) + + except Exception as e: + print( + f"[ERROR] Failed to stitch sentence {hw_id} block{block_no} line{line_no}: {e}", + file=sys.stderr, + ) + + if sentence_progress and sentence_task_id is not None: + sentence_progress.advance(sentence_task_id, 1) + + # Save sentence mapping + sentence_mapping_file = sentences_dir / "sentence_map.json" + with sentence_mapping_file.open("w", encoding="utf-8") as f: + json.dump( + { + "backend": "diffusion-hf-sentences", + "word_gap": word_gap, + "sentences": sentence_results, + }, + f, + ensure_ascii=False, + indent=2, + ) + + print(f"✓ Generated {len(sentence_results)} sentence images") + print(f"✓ Sentence mapping saved: {sentence_mapping_file}") + + # Build mapping structure + entries_map: Dict[Tuple[str, str], List[Dict[str, Any]]] = {} + for r in results: + key = (r["source_json"], r["hw_id"]) + entries_map.setdefault(key, []).append(r) + + # Export file author styles + file_author_styles_export = { + fname: {aid: {"style_id": sid} for aid, sid in inner.items()} + for fname, inner in sorted(file_author_style_ids.items()) + } + + consolidated = { + "backend": "diffusion-hf", + "split_length": split_length_words, + "digit_chunk_length": split_length_numeric, + "temperature": 
temperature, + "steps": steps, + "segment_gap": segment_gap, + "word_gap": word_gap if stitch_sentences else None, + "baseline_percentile": baseline_percentile, + "entries": [ + { + "source_json": src, + "hw_id": hw, + "author_id": words[0]["author_id"] if words else None, + "words": [ + { + "block_no": w["block_no"], + "line_no": w["line_no"], + "word_no": w["word_no"], + "image": w["image"], + "style_id": w["style_id"], + "width": w["width"], + "height": w["height"], + "baseline": w["baseline"], + "segments": w["segments"], + } + for w in sorted( + words, key=lambda x: (x["block_no"], x["line_no"], x["word_no"]) + ) + ], + } + for (src, hw), words in sorted(entries_map.items()) + ], + "file_author_styles": file_author_styles_export, + } + + mapping_path = mapping_file or (output_dir / "raw_token_map.json") + with mapping_path.open("w", encoding="utf-8") as f: + json.dump(consolidated, f, ensure_ascii=False, indent=2) + + generated_count = sum(1 for r in results if not r["skipped"]) + reused_count = sum(1 for r in results if r["skipped"]) + log_file_path = log_file or (output_dir / "generation_log.json") + log_payload = { + "timestamp": datetime.utcnow().isoformat() + "Z", + "summary": { + "total_tasks": len(tasks), + "extraction_skips": len( + [ + entry + for entry in extraction_logs + if entry.get("type") == "extraction_skip" + ] + ), + "words_generated": generated_count, + "words_reused": reused_count, + "generation_errors": len(generation_error_log), + "sentence_exclusions": len(sentence_exclusion_log), + }, + "details": { + "extraction": extraction_logs, + "generation_skips": generation_skip_log, + "generation_errors": generation_error_log, + "sentence_exclusions": sentence_exclusion_log, + }, + } + with log_file_path.open("w", encoding="utf-8") as log_fp: + json.dump(log_payload, log_fp, ensure_ascii=False, indent=2) + + print(f"\n✓ Generated {len(results)} word images") + print(f"✓ Mapping saved: {mapping_path}") + print(f"✓ Log saved: {log_file_path}") + 
print("[DONE] Freeing up memory..") + for k, v in components.items(): + del v + del components + torch.cuda.empty_cache() + + +def main() -> None: + ap = argparse.ArgumentParser( + description="Diffusion-based handwriting token generator with intelligent word splitting." + ) + ap.add_argument( + "--input-dir", + type=Path, + required=True, + help="Directory containing bbox JSON files", + ) + ap.add_argument( + "--output-dir", + type=Path, + required=True, + help="Output directory for generated images", + ) + ap.add_argument( + "--run-dir", + type=Path, + required=True, + help="Model experiment directory (e.g., model/experiments/hf_conditional_latent)", + ) + ap.add_argument( + "--checkpoint", type=str, default="latest.pt", help="Checkpoint filename" + ) + ap.add_argument("--steps", type=int, default=30, help="Number of diffusion steps") + ap.add_argument( + "--split-length-words", + type=int, + default=6, + help="Maximum word length before splitting (0 = no splitting)", + ) + ap.add_argument( + "--temperature", type=float, default=0.5, help="Sampling temperature" + ) + ap.add_argument("--seed", type=int, default=42, help="Random seed") + ap.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)") + ap.add_argument( + "--overwrite", action="store_true", help="Overwrite existing images" + ) + ap.add_argument( + "--mapping-file", type=Path, default=None, help="Output mapping JSON path" + ) + ap.add_argument( + "--log-file", + type=Path, + default=None, + help="Optional path for JSON log output (default: output_dir/generation_log.json)", + ) + ap.add_argument( + "--batch-size", type=int, default=32, help="Batch size for generation" + ) + ap.add_argument( + "--stitch-sentences", + default=True, + action="store_true", + help="Generate sentence-level stitched images in separate folder", + ) + ap.add_argument( + "--segment-gap", + type=int, + default=2, + help="Gap between word segments (split parts) in pixels", + ) + ap.add_argument( + "--word-gap", + 
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for batch-first sequences.

    A table of shape ``[1, max_len, d_model]`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(position * freq)`` and odd columns hold ``cos(position * freq)``.

    Args:
        d_model: Feature dimension of the sequences to encode (even or odd).
        max_len: Longest sequence length the table supports.
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()

        # One row per position, one frequency per sine column.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the frequency vector is trimmed to the cosine count.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]

        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor of shape [batch_size, seq_len, d_model]
        Returns:
            Tensor with positional encoding added
        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` of the table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding "
                f"max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]
class TextEncoder(nn.Module):
    """
    Transformer-based text encoder for character-level conditioning.

    Data flow: character ids -> embedding -> linear projection to ``d_model``
    -> positional encoding + dropout -> ``num_layers`` encoder blocks ->
    LayerNorm -> linear projection to ``output_dim``.
    """

    def __init__(
        self,
        vocab_size: int,
        char_embed_dim: int = 256,
        d_model: int = 512,
        num_layers: int = 6,
        num_heads: int = 8,
        d_ff: int = 2048,
        max_length: int = 128,
        dropout: float = 0.1,
        output_dim: int = 512
    ):
        """
        Args:
            vocab_size: Size of character vocabulary
            char_embed_dim: Dimension of character embeddings
            d_model: Hidden dimension of transformer
            num_layers: Number of transformer layers
            num_heads: Number of attention heads
            d_ff: Dimension of feed-forward layer
            max_length: Maximum sequence length
            dropout: Dropout probability
            output_dim: Output dimension for conditioning
        """
        super().__init__()

        self.d_model = d_model
        self.output_dim = output_dim

        # Character embedding; index 0 is declared as the padding index.
        self.char_embedding = nn.Embedding(vocab_size, char_embed_dim, padding_idx=0)

        # Project char embeddings to model dimension
        self.input_projection = nn.Linear(char_embed_dim, d_model)

        # Positional encoding
        self.pos_encoding = PositionalEncoding(d_model, max_length)

        # Transformer encoder layers
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        # Output projection
        self.output_projection = nn.Linear(d_model, output_dim)

        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize Linear layers (Xavier, zero bias) and Embeddings (N(0, 0.02))."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0, std=0.02)
                # normal_ overwrites the whole table, including the row that
                # nn.Embedding zeroed for padding_idx. That row gets no
                # gradient, so it would stay random forever; zero it again.
                if module.padding_idx is not None:
                    with torch.no_grad():
                        module.weight[module.padding_idx].fill_(0)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Forward pass.

        Args:
            input_ids: [batch_size, seq_len] - Token indices
            attention_mask: [batch_size, seq_len] - 1 for valid, 0 for padding

        Returns:
            Encoded text features [batch_size, seq_len, output_dim]
        """
        # Character embedding
        x = self.char_embedding(input_ids)  # [B, seq_len, char_embed_dim]

        # Project to model dimension
        x = self.input_projection(x)  # [B, seq_len, d_model]

        # Add positional encoding
        x = self.pos_encoding(x)
        x = self.dropout(x)

        # Pass through transformer layers
        for layer in self.layers:
            x = layer(x, attention_mask)

        # Normalize
        x = self.norm(x)

        # Project to output dimension
        x = self.output_projection(x)  # [B, seq_len, output_dim]

        return x

    def get_sequence_embedding(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Get single embedding for entire sequence (mean pooling over valid tokens).

        Args:
            input_ids: [batch_size, seq_len]
            attention_mask: [batch_size, seq_len] - 1 for valid, 0 for padding.
                When omitted, all positions are averaged.

        Returns:
            Pooled embedding [batch_size, output_dim]
        """
        # Get token-level embeddings
        token_embeddings = self.forward(input_ids, attention_mask)  # [B, seq_len, output_dim]

        if attention_mask is None:
            return token_embeddings.mean(dim=1)

        # Cast the mask to the embedding dtype so the arithmetic below does
        # not depend on how the caller stored the mask (float, int or bool).
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)  # [B, seq_len, 1]
        sum_embeddings = torch.sum(token_embeddings * mask, dim=1)
        # Clamp so a fully padded row yields zeros instead of dividing by zero.
        valid_counts = torch.clamp(mask.sum(dim=1), min=1e-9)  # [B, 1]
        return sum_embeddings / valid_counts
class CharTokenizer:
    """Character-level tokenizer with special tokens.

    Maps every character to an integer id, optionally wraps sequences in
    SOS/EOS markers, and pads or truncates to ``max_length``. The vocabulary
    can be written to and read back from a JSON file.
    """

    # Special tokens. They must be four distinct strings: the default
    # vocabulary uses them as dictionary keys for ids 0-3, so identical
    # values collapse into one entry. Multi-character markers cannot collide
    # with a single input character.
    # NOTE(review): confirm these spellings match any vocabulary JSON that
    # was saved earlier.
    PAD_TOKEN = "<PAD>"
    UNK_TOKEN = "<UNK>"
    SOS_TOKEN = "<SOS>"
    EOS_TOKEN = "<EOS>"

    def __init__(
        self,
        vocab: Optional[Dict[str, int]] = None,
        max_length: int = 128
    ):
        """
        Initialize tokenizer.

        Args:
            vocab: Character to index mapping. If None, will be built from data.
            max_length: Maximum sequence length for padding/truncation.
        """
        self.max_length = max_length

        if vocab is None:
            # Initialize with special tokens only
            self.char_to_idx = {
                self.PAD_TOKEN: 0,
                self.UNK_TOKEN: 1,
                self.SOS_TOKEN: 2,
                self.EOS_TOKEN: 3,
            }
        else:
            # Copy so that build_vocab() never mutates the caller's dict.
            self.char_to_idx = dict(vocab)

        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
        self.vocab_size = len(self.char_to_idx)

    def build_vocab(self, texts: List[str]) -> None:
        """
        Build vocabulary from list of texts.

        Characters already in the vocabulary keep their ids; new ones are
        appended in sorted order.

        Args:
            texts: List of text strings to build vocabulary from.
        """
        # Collect all unique characters
        unique_chars = set()
        for text in texts:
            unique_chars.update(text)

        # Sort for deterministic ordering
        unique_chars = sorted(unique_chars)

        # Add to vocabulary (starting after special tokens)
        for char in unique_chars:
            if char not in self.char_to_idx:
                self.char_to_idx[char] = len(self.char_to_idx)

        # Update reverse mapping
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
        self.vocab_size = len(self.char_to_idx)

        print(f"Built vocabulary with {self.vocab_size} characters")
        print(f"Sample characters: {list(unique_chars)[:20]}")

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        padding: bool = True,
        truncation: bool = True,
        return_attention_mask: bool = True
    ) -> Dict[str, np.ndarray]:
        """
        Encode text to token indices.

        Args:
            text: Input text string.
            add_special_tokens: Whether to add SOS/EOS tokens.
            padding: Whether to pad to max_length.
            truncation: Whether to truncate to max_length.
            return_attention_mask: Whether to return attention mask.

        Returns:
            Dictionary with 'input_ids' (int64) and optionally
            'attention_mask' (float32, 1 for real tokens, 0 for padding).
        """
        unk_id = self.char_to_idx[self.UNK_TOKEN]

        # Convert characters to indices; unknown characters map to UNK.
        token_ids = [self.char_to_idx.get(char, unk_id) for char in text]

        if add_special_tokens:
            token_ids = (
                [self.char_to_idx[self.SOS_TOKEN]]
                + token_ids
                + [self.char_to_idx[self.EOS_TOKEN]]
            )

        # Truncation
        if truncation and len(token_ids) > self.max_length:
            token_ids = token_ids[:self.max_length]
            if add_special_tokens:
                # Keep the sequence terminated after cutting off the tail.
                token_ids[-1] = self.char_to_idx[self.EOS_TOKEN]

        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * len(token_ids)

        # Padding
        if padding and len(token_ids) < self.max_length:
            padding_length = self.max_length - len(token_ids)
            token_ids.extend([self.char_to_idx[self.PAD_TOKEN]] * padding_length)
            attention_mask.extend([0] * padding_length)

        result = {
            'input_ids': np.array(token_ids, dtype=np.int64)
        }

        if return_attention_mask:
            result['attention_mask'] = np.array(attention_mask, dtype=np.float32)

        return result

    def encode_batch(
        self,
        texts: List[str],
        add_special_tokens: bool = True,
        padding: bool = True,
        truncation: bool = True,
        return_attention_mask: bool = True
    ) -> Dict[str, np.ndarray]:
        """
        Encode batch of texts.

        The per-text arrays are stacked, so all encodings must have the same
        length; the default padding and truncation settings guarantee that.

        Args:
            texts: List of text strings.
            add_special_tokens: Whether to add SOS/EOS tokens.
            padding: Whether to pad to max_length.
            truncation: Whether to truncate to max_length.
            return_attention_mask: Whether to return attention mask.

        Returns:
            Dictionary with batched 'input_ids' and optionally 'attention_mask'.
        """
        batch_encoding = [
            self.encode(
                text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                return_attention_mask=return_attention_mask
            )
            for text in texts
        ]

        result = {
            'input_ids': np.stack([enc['input_ids'] for enc in batch_encoding])
        }

        if return_attention_mask:
            result['attention_mask'] = np.stack([enc['attention_mask'] for enc in batch_encoding])

        return result

    def decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = True
    ) -> str:
        """
        Decode token indices to text.

        Args:
            token_ids: List of token indices.
            skip_special_tokens: Whether to skip special tokens in output.

        Returns:
            Decoded text string.
        """
        chars = []
        special_tokens = {
            self.char_to_idx[self.PAD_TOKEN],
            self.char_to_idx[self.UNK_TOKEN],
            self.char_to_idx[self.SOS_TOKEN],
            self.char_to_idx[self.EOS_TOKEN]
        }

        for idx in token_ids:
            # Normalise array/tensor scalars to plain ints so the set and
            # dict lookups below behave the same for every input container.
            idx = int(idx)
            if skip_special_tokens and idx in special_tokens:
                continue
            chars.append(self.idx_to_char.get(idx, self.UNK_TOKEN))

        return ''.join(chars)

    def save(self, save_path: str) -> None:
        """
        Save tokenizer to file.

        Args:
            save_path: Path to save tokenizer (JSON file).
        """
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        config = {
            'char_to_idx': self.char_to_idx,
            'max_length': self.max_length,
            'vocab_size': self.vocab_size
        }

        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False, indent=2)

        print(f"Tokenizer saved to {save_path}")

    @classmethod
    def load(cls, load_path: str) -> "CharTokenizer":
        """
        Load tokenizer from file.

        Args:
            load_path: Path to load tokenizer from (JSON file).

        Returns:
            Loaded tokenizer instance.
        """
        with open(load_path, 'r', encoding='utf-8') as f:
            config = json.load(f)

        tokenizer = cls(
            vocab=config['char_to_idx'],
            max_length=config['max_length']
        )

        print(f"Tokenizer loaded from {load_path}")
        print(f"Vocabulary size: {tokenizer.vocab_size}")

        return tokenizer

    def __len__(self) -> int:
        """Return vocabulary size."""
        return self.vocab_size

    def __repr__(self) -> str:
        return f"CharTokenizer(vocab_size={self.vocab_size}, max_length={self.max_length})"
+ encoded = tokenizer.encode(test_text) + print(f"\nTest encoding for: '{test_text}'") + print(f"Input IDs: {encoded['input_ids'][:20]}") + print(f"Attention mask: {encoded['attention_mask'][:20]}") + + # Test decoding + decoded = tokenizer.decode(encoded['input_ids']) + print(f"Decoded: '{decoded}'") diff --git a/docgenie/generation/main.py b/docgenie/generation/main.py new file mode 100755 index 0000000000000000000000000000000000000000..d53e6f549f429523e853b6615cc2013a98394a81 --- /dev/null +++ b/docgenie/generation/main.py @@ -0,0 +1,254 @@ +import argparse +import os +import pathlib +import shutil + +from docgenie import ENV +from docgenie.generation.constants import HANDWRITING_DEFAULT_BATCH_SIZE +from docgenie.generation.models import LLMType, DatasetTask +from docgenie.generation.models import PipelineParameters, SynDatasetDefinition +from docgenie.data.interface import load_dataset +from docgenie.generation.pipeline_01_select_seeds import pipeline_select_seeds +from docgenie.generation.pipeline_02_prompt_llm import ( + pipeline_retrieve_document_html_seed_based, +) +from docgenie.generation.pipeline_03_process_response import ( + pipeline_process_response_extract_html_and_gt, +) +from docgenie.generation.pipeline_04_render_pdf_and_extract_geos import ( + pipeline_render_pdf_and_extract_geos_parallel, +) +from docgenie.generation.pipeline_05_extract_bboxes_from_pdf import ( + pipeline_extract_bboxes, +) +from docgenie.generation.pipeline_06_extract_layout_element_definitions_and_annotation_gt import ( + pipeline_extract_layout_element_definitions_and_annotation_gt, +) +from docgenie.generation.pipeline_08_extract_visual_element_definitions import ( + pipeline_extract_visual_element_definitions, +) +from docgenie.generation.pipeline_07_extract_handwriting import ( + pipeline_extract_handwritten_fields, +) +from docgenie.generation.pipeline_09_create_handwriting_images import ( + pipeline_create_handwriting_images, +) +from 
def parse_args(argv=None):
    """Parse and validate the command line of the DocGenie generator.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        The parsed ``argparse.Namespace``. ``args.LLMType`` is converted from
        its string value to the corresponding ``LLMType`` member.

    Exits with an argparse usage error when the dataset definition name is
    empty, or when ``--apikey`` names an environment variable that is unset
    or empty.
    """
    parser = argparse.ArgumentParser(
        description="DocGenie Synthetic Document Generator",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "SynDatasetDefinition",
        type=str,
        help="Filename without extension of the SynDatasetDefinition in data/syn_dataset_definitions",
    )

    parser.add_argument(
        "--reset",
        "-r",
        action="store_true",
        help="If set, all previous data is deleted prior to execution, except: prompt batches, prompt responses and seed images.",
    )

    parser.add_argument(
        "--entry",
        "-e",
        type=int,
        default=None,
        help="If set, starts the pipeline at this step",
    )

    parser.add_argument(
        "--hwbs",
        type=int,
        default=HANDWRITING_DEFAULT_BATCH_SIZE,
        help="Handwriting batch size",
    )

    parser.add_argument(
        "--nohw",
        action="store_true",
        help="Runs the pipeline without creating handwriting",
    )

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Runs the pipeline and creates debug data",
    )

    parser.add_argument(
        "--seedsonly",
        "-s",
        action="store_true",
        help="If set, the pipeline only collects seed images and then aborts",
    )

    parser.add_argument(
        "--apikey",
        type=str,
        default=None,
        help="If given, use the env variable with this name to retrieve the anthropic API key",
    )

    parser.add_argument(
        "--LLMType",
        type=str,
        choices=[e.value for e in LLMType],
        default=LLMType.CLAUDE.value,
        help="Define the whether to use closed source model or open source (currently just Qwen2.5-32B)",
    )

    parser.add_argument(
        "--message_custom_id",
        type=str,
        default=None,
        help="If specified, the pipeline is run only for this message and ignores existing results.",
    )

    args = parser.parse_args(argv)
    # argparse has already restricted the string to the enum's values.
    args.LLMType = LLMType(args.LLMType)

    # Validate with parser.error rather than assert: asserts are stripped
    # under `python -O`, and a usage message is clearer than a traceback.
    if not args.SynDatasetDefinition:
        parser.error("SynDatasetDefinition must not be empty")

    if args.apikey:
        if not os.getenv(args.apikey):
            parser.error(
                f"environment variable '{args.apikey}' given via --apikey is not set or is empty"
            )
        print(f"Using Anthropic API Key from {args.apikey}")

    print(args)
    return args
All existing data from {dsdef.name} will be deleted, except: + - {dsfiles.prompt_batches_directory} + - {dsfiles.message_results_directory} + - {dsfiles.ocr_results_directory} + - {dsfiles.preprocessed_seed_images_directory} +""") + dsdef.reset_data_except_prompt_and_seeds() + + dataset = load_dataset(dsdef.base_dataset_name, split="train") + + print(f"The LLM will be used is: {args.LLMType}") + + # Copy used syn dataset defintion to output directory + dst = dsfiles.base_path / f"{args.SynDatasetDefinition}.yaml" + shutil.copy2(deffile, dst) + + params = PipelineParameters( + dsdef=dsdef, + llmtype=args.LLMType, + message_custom_id=args.message_custom_id, + seedsonly=args.seedsonly, + debug=args.debug, + handwriting_batch_size=args.hwbs, + generate_handwriting=not args.nohw, + api_key_env_variable_name=args.apikey, + ) + + entry = args.entry or 0 + + # Execute pipeline + if entry <= 1: + pipeline_select_seeds(params=params) + + if args.seedsonly: + exit(0) + + if entry <= 2: + pipeline_retrieve_document_html_seed_based(params=params) + + if entry <= 3: + pipeline_process_response_extract_html_and_gt(params=params) + + if entry <= 4: + pipeline_render_pdf_and_extract_geos_parallel(params=params) + + if entry <= 5: + pipeline_extract_bboxes(params=params) + + if entry <= 6: + if dsdef.prompt_task == "annotation": + pipeline_extract_layout_element_definitions_and_annotation_gt(params=params) + + if entry <= 7: + pipeline_extract_handwritten_fields(params=params) + + if entry <= 8: + pipeline_extract_visual_element_definitions(params=params) + + if entry <= 9: + pipeline_create_handwriting_images(params=params) + + if entry <= 10: + pipeline_create_visual_elements(params=params) + + if entry <= 11: + pipeline_render_pdf_second_pass(params=params) + + if entry <= 12: + pipeline_handwritten_text_insertion(params=params) + + if entry <= 13: + pipeline_insert_visual_elements(params=params) + + if entry <= 14: + pipeline_render_image(params=params) + + if entry <= 15: + 
@dataclass(frozen=True)
class OCRBox:
    """Immutable OCR box: two corner points, the recognized text and its indices."""

    x0: float
    y0: float
    x2: float
    y2: float
    text: str
    block_no: int
    line_no: int
    word_no: int

    def _with_corners(self, x0, y0, x2, y2):
        # Build a copy that keeps the text and indices but uses new corners.
        return OCRBox(
            x0,
            y0,
            x2,
            y2,
            text=self.text,
            block_no=self.block_no,
            line_no=self.line_no,
            word_no=self.word_no,
        )

    @property
    def key(self):
        """Return the ``(block_no, line_no, word_no)`` tuple of this box."""
        return self.block_no, self.line_no, self.word_no

    def as_string(self) -> str:
        """Return all fields, in declaration order, joined by commas."""
        values = (
            self.x0,
            self.y0,
            self.x2,
            self.y2,
            self.text,
            self.block_no,
            self.line_no,
            self.word_no,
        )
        return ",".join(f"{value}" for value in values)

    @property
    def width(self):
        """Horizontal extent, ``x2 - x0``."""
        return self.x2 - self.x0

    @property
    def height(self):
        """Vertical extent, ``y2 - y0``."""
        return self.y2 - self.y0

    def unnormalize(self, width_px, height_px):
        """Return a copy with x values times ``width_px`` and y values times ``height_px``."""
        return self._with_corners(
            self.x0 * width_px,
            self.y0 * height_px,
            self.x2 * width_px,
            self.y2 * height_px,
        )

    def scale(self, scale):
        """Return a copy with all four coordinates multiplied by ``scale``."""
        return self._with_corners(
            self.x0 * scale,
            self.y0 * scale,
            self.x2 * scale,
            self.y2 * scale,
        )


@dataclass(frozen=True)
class LayoutBox:
    """Immutable labelled rectangle given by the corners (x0, y0) and (x2, y2)."""

    x0: float
    y0: float
    x2: float
    y2: float
    label: str

    @staticmethod
    def box_contains(outer: LayoutBox, inner: LayoutBox) -> bool:
        """Return True when ``inner`` lies entirely inside ``outer`` (edges may touch)."""
        starts_inside = outer.x0 <= inner.x0 and outer.y0 <= inner.y0
        ends_inside = outer.x2 >= inner.x2 and outer.y2 >= inner.y2
        return starts_inside and ends_inside

    @staticmethod
    def calculate_overlap_ratio(box1: LayoutBox, box2: LayoutBox) -> float:
        """Return intersection area divided by the area of the smaller box.

        Returns 0.0 when the boxes do not overlap (touching edges count as no
        overlap) or when the smaller box has no positive area.
        """
        left = max(box1.x0, box2.x0)
        top = max(box1.y0, box2.y0)
        right = min(box1.x2, box2.x2)
        bottom = min(box1.y2, box2.y2)

        if right <= left or bottom <= top:
            return 0.0

        shared_area = (right - left) * (bottom - top)
        smaller_area = min(
            (box1.x2 - box1.x0) * (box1.y2 - box1.y0),
            (box2.x2 - box2.x0) * (box2.y2 - box2.y0),
        )
        if smaller_area > 0:
            return shared_area / smaller_area
        return 0.0

    @staticmethod
    def normalize_to_pdf(bbox: LayoutBox, width_pt: float, height_pt: float, dpi: float) -> LayoutBox:
        """Return ``bbox`` with each coordinate divided by the page size.

        Box and page size are both scaled from points to pixels with
        ``dpi / 72`` before dividing; the label is carried over unchanged.
        """
        px_per_pt = dpi / 72
        page_w_px = width_pt * px_per_pt
        page_h_px = height_pt * px_per_pt

        return LayoutBox(
            x0=bbox.x0 * px_per_pt / page_w_px,
            y0=bbox.y0 * px_per_pt / page_h_px,
            x2=bbox.x2 * px_per_pt / page_w_px,
            y2=bbox.y2 * px_per_pt / page_h_px,
            label=bbox.label,
        )
class LLMType(Enum):
    """Closed set of LLM type identifiers; each value is the lowercase member name."""

    CLAUDE = "claude"
    QWEN = "qwen"
    DEEPSEEK = "deepseek"


class DatasetTask(Enum):
    """Closed set of dataset task identifiers; each value equals the member name."""

    KIE = "KIE"
    QA = "QA"
    DLA = "DLA"
    CLASSIFICATION = "CLASSIFICATION"


class SyntheticDatasetFileStructure:
    """Directory layout of one synthetic dataset, rooted at ``base_path``.

    Creating an instance creates every directory listed in
    ``_MANAGED_DIRECTORIES`` (with parents); directories that already exist
    are left as they are. All paths are exposed as read-only properties.
    """

    # Directory properties created by __init__, in creation order. A new
    # directory property must be added here to be created automatically.
    _MANAGED_DIRECTORIES = (
        "prompt_batches_directory",
        "message_results_directory",
        "preprocessed_seed_images_directory",
        "message_processing_logs_directory",
        "raw_html_directory",
        "render_html_directory",
        "render_html_second_pass_directory",
        "geometries_directory",
        "raw_annotations_directory",
        "pdf_initial_directory",
        "pdf_with_handwriting_directory",
        "pdf_without_handwriting_placeholder_directory",
        "final_pdf_directory",
        "bboxes_pdf_directory",
        "bboxes_final_directory",
        "bboxes_final_normalized_directory",
        "ocr_results_directory",
        "img_directory",
        "gt_directory",
        "document_logs_directory",
        "handwritten_bboxes_directory",
        "visual_element_definitions_directory",
        "visual_elements_directory",
        "layout_element_definitions_directory",
        # Directories for handwritten text images
        "handwritten_text_images_directory",
        "debug_pdf_visual_elements_directory",
        "debug_pdf_handwriting_directory",
        "debug_pdf_layout_directory",
        "debug_pdf_geometries_directory",
        "debug_pdf_bboxes_final_directory",
        "debug_pdf_bboxes_directory",
        "debug_pdf_bboxes_and_geos_directory",
        "debug_ocr_bboxes_and_geos_directory",
        "debug_html_raw_directory",
    )

    def __init__(self, ds_name: str):
        """Store ``ds_name`` and create all managed directories on disk."""
        self.ds_name = ds_name

        for property_name in self._MANAGED_DIRECTORIES:
            getattr(self, property_name).mkdir(parents=True, exist_ok=True)

    @property
    def base_path(self) -> pathlib.Path:
        """Root directory of this dataset: ``ENV.SYN_DATASETS_DIR / ds_name``."""
        return ENV.SYN_DATASETS_DIR / self.ds_name

    @property
    def ds_log_path(self) -> pathlib.Path:
        return self.base_path / "dataset_log.json"

    @property
    def ds_csv_log_path(self) -> pathlib.Path:
        return self.base_path / "dataset_log.csv"

    # Keep on reset
    @property
    def prompt_batches_directory(self) -> pathlib.Path:
        return self.base_path / "logs" / "prompt_batches"

    # Keep on reset
    @property
    def message_results_directory(self) -> pathlib.Path:
        return self.base_path / "logs" / "prompt_message_results"

    # Keep on reset
    @property
    def preprocessed_seed_images_directory(self) -> pathlib.Path:
        return self.base_path / "preprocessed_seed_images"

    @property
    def message_processing_logs_directory(self) -> pathlib.Path:
        return self.base_path / "logs" / "message_processing_logs"

    @property
    def _html_directory(self) -> pathlib.Path:
        return self.base_path / "html"

    @property
    def raw_html_directory(self) -> pathlib.Path:
        return self._html_directory / "raw_html"

    @property
    def render_html_directory(self) -> pathlib.Path:
        return self._html_directory / "render_html_pass1"

    @property
    def render_html_second_pass_directory(self) -> pathlib.Path:
        return self._html_directory / "render_html_pass2"

    @property
    def geometries_directory(self) -> pathlib.Path:
        return self.base_path / "geometries"

    @property
    def _pdf_directory(self) -> pathlib.Path:
        return self.base_path / "pdf"

    @property
    def pdf_initial_directory(self) -> pathlib.Path:
        """Contains PDFs with handwriting-html-text visible"""
        return self._pdf_directory / "pdf_initial"

    @property
    def pdf_without_handwriting_placeholder_directory(self) -> pathlib.Path:
        """Contains PDFs with handwriting-html-text and visual element placeholders invisible"""
        return self._pdf_directory / "pdf_without_handwriting_placeholder"

    @property
    def pdf_with_handwriting_directory(self) -> pathlib.Path:
        """Contains PDFs where Handwriting and Visual Elements are invisible
        (need two render passes because transparent text is not included in PDF)"""
        # NOTE(review): the name suggests handwriting is present while the
        # docstring says it is invisible - confirm which one is accurate.
        return self._pdf_directory / "pdf_with_handwriting"

    @property
    def final_pdf_directory(self) -> pathlib.Path:
        """Contains final PDFs with handwriting and visual elements"""
        return self._pdf_directory / "pdf_final"

    @property
    def _bbox_directory(self) -> pathlib.Path:
        return self.base_path / "bbox"

    @property
    def bboxes_pdf_directory(self) -> pathlib.Path:
        """Contains the bounding boxes which were extracted from the PDF."""
        return self._bbox_directory / "bbox_pdf"

    @property
    def bboxes_final_directory(self) -> pathlib.Path:
        """For documents which contain handwriting or visual elements, this contains bounding boxes retrieved via OCR.
        Otherwise contains the bounding boxes which were extracted from the PDF."""
        return self._bbox_directory / "bbox_final"

    @property
    def bboxes_final_normalized_directory(self) -> pathlib.Path:
        """Contains the final bboxes but normalized to image size."""
        return self._bbox_directory / "bbox_final_normalized"

    @property
    def ocr_results_directory(self) -> pathlib.Path:
        """Contains OCR results for documents which contain handwriting or visual elements"""
        return self.base_path / "ocr_results"

    @property
    def img_directory(self) -> pathlib.Path:
        return self.base_path / "img"

    @property
    def _annotations_directory(self) -> pathlib.Path:
        return self.base_path / "annotations"

    @property
    def gt_directory(self) -> pathlib.Path:
        return self._annotations_directory / "gt"

    @property
    def raw_annotations_directory(self) -> pathlib.Path:
        return self._annotations_directory / "raw_annotations"

    @property
    def document_logs_directory(self) -> pathlib.Path:
        return self.base_path / "logs" / "document_logs"

    @property
    def _handwriting_directory(self) -> pathlib.Path:
        return self.base_path / "handwriting"

    @property
    def handwritten_bboxes_directory(self) -> pathlib.Path:
        return self._handwriting_directory / "handwriting_bbox"

    # Directories for handwritten text images
    @property
    def handwritten_text_images_directory(self) -> pathlib.Path:
        return self._handwriting_directory / "handwriting_raw_tokens"

    @property
    def _visual_elements_directory(self) -> pathlib.Path:
        return self.base_path / "visual_elements"

    @property
    def visual_element_definitions_directory(self) -> pathlib.Path:
        return self._visual_elements_directory / "visual_element_definitions"

    @property
    def visual_elements_directory(self) -> pathlib.Path:
        return self._visual_elements_directory / "visual_elements_images"

    @property
    def layout_element_definitions_directory(self) -> pathlib.Path:
        return self.base_path / "layout_element_definitions"

    @property
    def _debug_directory(self) -> pathlib.Path:
        return self.base_path / "debug"

    @property
    def debug_pdf_visual_elements_directory(self) -> pathlib.Path:
        return self._debug_directory / "visual_elements"

    @property
    def debug_pdf_handwriting_directory(self) -> pathlib.Path:
        return self._debug_directory / "handwriting"

    @property
    def debug_pdf_layout_directory(self) -> pathlib.Path:
        return self._debug_directory / "layout"

    @property
    def debug_pdf_geometries_directory(self) -> pathlib.Path:
        return self._debug_directory / "geometries"

    @property
    def debug_pdf_bboxes_final_directory(self) -> pathlib.Path:
        return self._debug_directory / "bboxes_final"

    @property
    def debug_pdf_bboxes_directory(self) -> pathlib.Path:
        return self._debug_directory / "bboxes"

    @property
    def debug_pdf_bboxes_and_geos_directory(self) -> pathlib.Path:
        return self._debug_directory / "bboxes_and_geos"

    @property
    def debug_ocr_bboxes_and_geos_directory(self) -> pathlib.Path:
        return self._debug_directory / "ocr_bboxes_and_geos"

    @property
    def debug_html_raw_directory(self) -> pathlib.Path:
        return self._debug_directory / "html_raw"

    # The three path getters below only build a path; the ``level``
    # sub-directory is not created by __init__.
    def get_pdf_bbox_path(self, level: Literal["word", "char"], doc_id: str):
        """Return ``bboxes_pdf_directory / level / "<doc_id>.txt"``."""
        return self.bboxes_pdf_directory / level / f"{doc_id}.txt"

    def get_final_bbox_path(self, level: Literal["word", "segment"], doc_id: str):
        """Return ``bboxes_final_directory / level / "<doc_id>.txt"``."""
        return self.bboxes_final_directory / level / f"{doc_id}.txt"

    def get_final_normalized_bbox_path(
        self, level: Literal["word", "segment"], doc_id: str
    ):
        """Return ``bboxes_final_normalized_directory / level / "<doc_id>.txt"``."""
        return self.bboxes_final_normalized_directory / level / f"{doc_id}.txt"
100755 index 0000000000000000000000000000000000000000..4fe79da83358f1cd6833223f53de5e691e5cba9c --- /dev/null +++ b/docgenie/generation/models/_log.py @@ -0,0 +1,177 @@ +import json +import pathlib + + +class PromptMsgResultLogKey: + custom_id = "custom_id" + id = "id" + result_type = "result_type" + error = "error" + response = "response" + usage_input_tokens = "usage_input_tokens" + usage_output_tokens = "usage_output_tokens" + + +class MessageProcessingLogKey: + custom_id = "custom_id" + result_type = "result_type" + num_documents_expected = "num_documents_expected" + num_documents_found = "num_documents_found" + document_ids = "document_ids" + + +class DocLogKey: + document_id = "document_id" + html_len = "html_len" + + raw_json_gt_found = "raw_json_gt_found" + raw_json_gt_valid_json = "raw_json_gt_valid_json" + + raw_annotation_gt_found = "raw_annotation_gt_found" + raw_annotation_gt_extraction_errors = "raw_annotation_gt_extraction_errors" + + raw_gt_or_annotation_annotations_count = "raw_gt_or_annotation_annotations_count" + + render_html_width = "render_html_width" + render_html_height = "render_html_height" + + pdf_num_pages = "pdf_num_pages" + pdf_render_error = "pdf_render_error" + + num_geometries_extracted = "num_geometries_extracted" + + num_word_bboxes = "num_word_bboxes" + num_char_bboxes = "num_char_bboxes" + can_map_chars_to_words = "can_map_chars_to_words" + + handwriting_num_elements = "handwriting_num_elements" + handwriting_element_extraction_errors = "handwriting_element_extraction_errors" + + handwriting_generation_authorid_to_writerstyle = ( + "handwriting_generation_authorid_to_writerstyle" + ) + + handwriting_insertion_success = "handwriting_insertion_success" + handwriting_images_were_generated = "handwriting_images_were_generated" + handwriting_missing_images = "handwriting_missing_images" + + visual_elements_insertion_success = "visual_elements_insertion_success" + visual_elements_were_generated = "visual_elements_were_generated" + 
visual_elements_missing_images = "visual_elements_missing_images" + visual_elements_num_elements = "visual_elements_num_elements" + visual_elements_extraction_errors = "visual_elements_extraction_errors" + + visual_elements_generation_logs = "visual_elements_generation_logs" + visual_elements_generation_errors = "visual_elements_generation_errors" + + layout_elements_num_elements = "layout_elements_num_elements" + layout_elements_extraction_errors = "layout_elements_extraction_errors" + + layout_elements_generation_logs = "layout_elements_generation_logs" + layout_elements_generation_errors = "layout_elements_generation_errors" + + ocr_required = "ocr_required" + ocr_found = "ocr_found" + ocr_num_bboxes_words = "ocr_num_bboxes_words" + ocr_num_bboxes_lines = "ocr_num_bboxes_lines" + ocr_error = "ocr_error" + + gt_verification_confirmed_keys = "gt_verification_confirmed_keys" + gt_verification_similarities = "gt_verification_similarities" + gt_verification_passed = "gt_verification_passed" + gt_verification_skipped = "gt_verification_skipped" + + +class SynDocumentLog: + def __init__(self, document_id: str, logdir: pathlib.Path): + self.document_id = document_id + logpath = logdir / f"{document_id}.json" + self.log = json.loads(logpath.read_text(encoding="utf-8")) + + @property + def raw_json_gt_found(self): + return self.log.get(DocLogKey.raw_json_gt_found, False) + + @property + def raw_json_gt_valid_json(self): + return self.log.get(DocLogKey.raw_json_gt_valid_json, False) + + @property + def raw_annotation_gt_found(self): + return self.log.get(DocLogKey.raw_annotation_gt_found, False) + + @property + def raw_annotation_gt_extraction_errors(self): + return self.log.get(DocLogKey.raw_annotation_gt_extraction_errors, [-1]) + + @property + def gt_verification_passed(self): + return self.log.get(DocLogKey.gt_verification_passed, False) + + @property + def gt_verification_skipped(self): + return self.log.get(DocLogKey.gt_verification_skipped, False) + + @property + 
def pdf_num_pages(self): + return self.log.get(DocLogKey.pdf_num_pages, -1) + + @property + def num_word_bboxes(self): + return self.log.get(DocLogKey.num_word_bboxes, -1) + + @property + def num_char_bboxes(self): + return self.log.get(DocLogKey.num_char_bboxes, -1) + + @property + def can_map_chars_to_words(self): + return self.log.get(DocLogKey.can_map_chars_to_words, False) + + @property + def handwriting_num_elements(self): + return self.log.get(DocLogKey.handwriting_num_elements, -1) + + @property + def handwriting_element_extraction_errors(self): + return self.log.get(DocLogKey.handwriting_element_extraction_errors, [-1]) + + @property + def handwriting_missing_images(self): + return self.log.get(DocLogKey.handwriting_missing_images, [-1]) + + @property + def visual_elements_num_elements(self): + return self.log.get(DocLogKey.visual_elements_num_elements, -1) + + @property + def visual_elements_extraction_errors(self): + return self.log.get(DocLogKey.visual_elements_extraction_errors, [-1]) + + @property + def layout_elements_num_elements(self): + return self.log.get(DocLogKey.layout_elements_num_elements, -1) + + @property + def layout_elements_extraction_errors(self): + return self.log.get(DocLogKey.layout_elements_extraction_errors, [-1]) + + @property + def ocr_required(self): + return self.log.get(DocLogKey.ocr_required, False) + + @property + def ocr_found(self): + return self.log.get(DocLogKey.ocr_found, False) + + @property + def render_html_width(self) -> int | None: + return self.log.get(DocLogKey.render_html_width, None) + + @property + def render_html_height(self) -> int | None: + return self.log.get(DocLogKey.render_html_height, None) + + @property + def annotations_count(self) -> int: + return self.log.get(DocLogKey.raw_gt_or_annotation_annotations_count, 0) diff --git a/docgenie/generation/models/_pipeline.py b/docgenie/generation/models/_pipeline.py new file mode 100755 index 
@dataclass
class PipelineParameters:
    """Options for one pipeline run: dataset definition, LLM type and run flags."""

    dsdef: SynDatasetDefinition
    llmtype: LLMType
    message_custom_id: str | None
    seedsonly: bool
    handwriting_batch_size: int
    debug: bool
    api_key_env_variable_name: str
    generate_handwriting: bool = True
    blur_handwriting_images: bool = True


@dataclass
class PromptParameters:
    """Values substituted into the placeholders of a prompt template."""

    num_solutions: int
    doc_type: str
    language: str
    gt_type: str  # eg "keys and their values"
    gt_format: str  # eg {"company": "company value", "date": "date value"...}


@dataclass
class SynDatasetDefinition:
    """Definition of a synthetic dataset plus helpers for its files, logs and prompt.

    ``get_file_structure`` and ``get_prompt`` cache their result on the
    instance (``_file_structure`` / ``_prompt``).
    """

    # General
    name: str
    task: str
    dataloader_model_task_as: (
        str | None
    )  # For Kleister which the data loading pipeline handles as QA
    base_dataset_name: str
    documents_count: int
    valid_labels: list[str]  # For DLA, KIE and Classification
    label_mapping: (
        dict[str, str] | None
    )  # For CORD because original labels have dots in them
    valid_secondary_labels: list[str]  # For groupings like in CORD or FUNSD

    # Prompt
    prompt_template: str
    prompt_task: str
    prompt_params: PromptParameters

    # Seed Documents
    hdbscan_min_cluster_size: int
    embedding_type: str
    seed_images_count: int
    alpha: float
    max_seed_pool: int
    seed_selection_strategy: str = "v1"

    def get_document_logs(self) -> Iterable[SynDocumentLog]:
        """Yield a ``SynDocumentLog`` for every ``*.json`` file in the document log directory.

        Entries with another suffix are skipped: their stem would be turned
        into a ``<stem>.json`` path that does not exist.
        """
        logs_dir = self.get_file_structure().document_logs_directory
        # TODO: dont read files but read from dataset log
        for logfile in logs_dir.iterdir():
            if logfile.suffix != ".json":
                continue
            yield SynDocumentLog(document_id=logfile.stem, logdir=logs_dir)

    def write_to_document_log(self, document_id: str, vals: dict):
        """Merge ``vals`` into ``<document_id>.json``, creating the file if needed.

        Existing keys are overwritten by ``vals``; other keys are kept.
        """
        dsfiles = self.get_file_structure()
        log_path = dsfiles.document_logs_directory / f"{document_id}.json"

        log = {}
        if log_path.exists():
            log = json.loads(log_path.read_text(encoding="utf-8"))

        log.update(vals)
        log_path.write_text(json.dumps(log, indent=2), encoding="utf-8")

    def reset_data_except_prompt_and_seeds(self):
        """Delete the generated-data directories and recreate the empty layout.

        Directories not listed below are left untouched.
        """
        import shutil

        dsfiles = self.get_file_structure()

        # NOTE(review): ocr_results_directory is not in this list, so OCR
        # results survive a reset - confirm that this is intended.
        dirs_to_delete = [
            dsfiles._annotations_directory,
            dsfiles._bbox_directory,
            dsfiles._debug_directory,
            dsfiles._handwriting_directory,
            dsfiles._html_directory,
            dsfiles.layout_element_definitions_directory,
            dsfiles.geometries_directory,
            dsfiles._pdf_directory,
            dsfiles._visual_elements_directory,
            dsfiles.img_directory,
            dsfiles.document_logs_directory,
            dsfiles.message_processing_logs_directory,
        ]
        for dir_path in dirs_to_delete:
            # The cached file structure does not recreate directories, so a
            # directory may already be gone (e.g. after an interrupted reset).
            if dir_path.exists():
                shutil.rmtree(dir_path)  # remove entire directory

        # Clear cache
        del self._file_structure
        # Recreate directory structure
        self.get_file_structure()

    def get_file_structure(self) -> SyntheticDatasetFileStructure:
        """Return the file structure for ``self.name``, creating and caching it on first use."""
        if not hasattr(self, "_file_structure"):
            self._file_structure = SyntheticDatasetFileStructure(ds_name=self.name)

        return self._file_structure

    def get_prompt_template(self) -> str:
        """Read the template ``seed-based[-<prompt_task>].txt`` from the template directory.

        The file is read as UTF-8, like every other text file in this module.
        """
        taskname = f"-{self.prompt_task}" if self.prompt_task else ""
        return (
            ENV.PROMPT_TEMPLATES_DIR
            / self.prompt_template
            / f"seed-based{taskname}.txt"
        ).read_text(encoding="utf-8")

    def get_prompt(self) -> str:
        """Return the template with all ``{placeholder}`` markers filled in (cached)."""
        if not hasattr(self, "_prompt"):
            params = self.prompt_params
            # Replacements run in this order; a substituted value that itself
            # contains a later placeholder is replaced too.
            substitutions = (
                ("{num_solutions}", params.num_solutions),
                ("{doc_type}", params.doc_type),
                ("{language}", params.language),
                ("{gt_type}", params.gt_type),
                ("{gt_format}", params.gt_format),
            )
            prompt = self.get_prompt_template()
            for placeholder, value in substitutions:
                prompt = prompt.replace(placeholder, f"{value}")
            self._prompt = prompt

        return self._prompt

    @staticmethod
    def from_file(yaml_path: str | pathlib.Path) -> SynDatasetDefinition:
        """Load a definition from a YAML file (read as UTF-8)."""
        with open(yaml_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        return from_dict(SynDatasetDefinition, data)
+ +from docgenie.generation.models import PromptMsgResultLogKey, SynDatasetDefinition +from docgenie.generation.utils.serialization import image_to_base64 +from docgenie.generation.utils.status import StatusLine +from docgenie.generation.pipeline_01.cost import ( + calculate_message_cost, + get_total_cost, + print_cost_report, +) + +T = TypeVar('T') + +def retry_with_backoff( + func: Callable[[], T], + max_retries: int = 3, + initial_delay: float = 1.0, + backoff_factor: float = 2.0, +) -> T: + """ + Retry a function with exponential backoff on transient Anthropic API errors. + + Args: + func: Function to retry (should take no arguments, use lambda if needed) + max_retries: Maximum number of retry attempts + initial_delay: Initial delay in seconds before first retry + backoff_factor: Multiplier for delay after each retry + + Returns: + Result of the function call + + Raises: + The last exception if all retries fail + """ + last_exception = None + delay = initial_delay + + for attempt in range(max_retries + 1): + try: + return func() + except anthropic.InternalServerError as e: + last_exception = e + if attempt < max_retries: + print(f"⚠️ Anthropic API error (attempt {attempt + 1}/{max_retries + 1}): {e}") + print(f" Retrying in {delay:.1f}s...") + time.sleep(delay) + delay *= backoff_factor + else: + print(f"❌ All {max_retries + 1} attempts failed") + raise + except anthropic.RateLimitError as e: + last_exception = e + if attempt < max_retries: + print(f"⚠️ Rate limit hit (attempt {attempt + 1}/{max_retries + 1}): {e}") + print(f" Retrying in {delay:.1f}s...") + time.sleep(delay) + delay *= backoff_factor + else: + print(f"❌ All {max_retries + 1} attempts failed") + raise + + raise last_exception # Should never reach here, but for type safety + + +def create_batch( + client: anthropic.Anthropic, + id_to_message: dict[str, MessageParam], + model=GENERATION.LLM, + max_tokens=GENERATION.MAX_TOKENS, +): + requests = [] + for msg_id, msg in id_to_message.items(): + 
def create_batch(
    client: anthropic.Anthropic,
    id_to_message: dict[str, MessageParam],
    model=GENERATION.LLM,
    max_tokens=GENERATION.MAX_TOKENS,
):
    """Submit one message batch and return the id of the created batch.

    Every ``custom_id -> message`` pair becomes one non-streaming request
    with the given ``model`` and ``max_tokens``.
    """
    batch_requests = [
        Request(
            custom_id=custom_id,
            params=MessageCreateParamsNonStreaming(
                model=model,
                max_tokens=max_tokens,
                messages=[message],
            ),
        )
        for custom_id, message in id_to_message.items()
    ]

    def _submit():
        return client.messages.batches.create(requests=batch_requests)

    # Use retry logic to handle transient Anthropic API errors
    created_batch = retry_with_backoff(
        _submit,
        max_retries=3,
        initial_delay=1.0,
        backoff_factor=2.0,
    )
    return created_batch.id


def create_message(prompt: str, images_base64: list[str]):
    """Build a user message: the prompt text block followed by one block per image.

    Images are passed as base64 data with media type ``image/jpeg``; a falsy
    ``images_base64`` produces a text-only message.
    """
    # Only prompt is cached, images not (because they come after) as they change with each call
    prompt_block = TextBlockParam(
        text=prompt, type="text", cache_control={"type": "ephemeral"}
    )
    image_blocks = [
        ImageBlockParam(
            source=Base64ImageSourceParam(
                media_type="image/jpeg", type="base64", data=encoded
            ),
            type="image",
        )
        for encoded in (images_base64 or [])
    ]

    return MessageParam(
        role="user",
        content=[prompt_block, *image_blocks],
    )
Each model call generates 4 HTML-based documents per iteration, repeated until the total target is reached +""" + + +class ClaudeBatchedClient: + def __init__(self, api_key: str): + self.client = anthropic.Anthropic(api_key=api_key) + + def send_batch( + self, + model: str, + prompts: Iterable[str], + images_base64: Iterable[list[str]], + image_docids: Iterable[list[str]], + batch_data_directory: pathlib.Path, + max_tokens: int = 8192, + ): + # assert len(prompts) == len(images_base64) + + # Collect batch data + id_to_message = dict() + id_to_message_seed_docids = dict() + for prompt, image_base64s, seed_docids in zip( + prompts, images_base64, image_docids + ): + # Create GUID message ID + message_id = str(uuid.uuid4()) + message = create_message(prompt=prompt, images_base64=image_base64s) + id_to_message[message_id] = message + id_to_message_seed_docids[message_id] = seed_docids + + # Send batch + batch_id = create_batch( + client=self.client, + id_to_message=id_to_message, + model=model, + max_tokens=max_tokens, + ) + + # Store batch data + batch_data_file = batch_data_directory / f"{batch_id}.json" + batch_metadata = { + "id": batch_id, + "model": model, + "processing_status": "in_progress", + "message_ids": list(id_to_message.keys()), + "message_id_to_seed_docids": id_to_message_seed_docids, + "created_at": datetime.now().isoformat(), + "ended_at": "", + "cost_tracking": { + "total_cost_usd": 0.0, + "total_input_tokens": 0, + "total_output_tokens": 0, + "total_cache_creation_tokens": 0, + "total_cache_read_tokens": 0, + }, + } + batch_metadata_json = json.dumps(batch_metadata, indent=2) + batch_data_file.write_text(batch_metadata_json, encoding="utf-8") + + def get_running_batches(self, batch_data_directory: pathlib.Path): + # Get metadata for all batches that are currently running + running_batches = [] + awaited_messages_total = 0 + for f in batch_data_directory.iterdir(): + if f.is_file(): + batch_metadata = json.loads(f.read_text()) + if 
batch_metadata["processing_status"] == "in_progress": + running_batches.append(batch_metadata) + awaited_messages_total += len(batch_metadata["message_ids"]) + + return running_batches, awaited_messages_total + + def await_batches( + self, + batch_data_directory: pathlib.Path, + message_data_directory: pathlib.Path, + sleep_seconds_between_batch: float = 2, + sleep_seconds_iteration: float = 30, + ): + running_batches, awaited_messages_total = self.get_running_batches( + batch_data_directory=batch_data_directory + ) + running_batches_count = len(running_batches) + print( + f"Found {running_batches_count} batches with {awaited_messages_total} messages in total." + ) + + status = StatusLine() + status.start() + + while any(running_batches): + finished_batches = [] + # print(f"Awaiting {len(running_batches)} batches...") + status.update_message(f"Awaiting {len(running_batches)} batches...") + + for batch_metadata in running_batches: + batch_id = batch_metadata["id"] + + # Use retry logic to handle transient Anthropic API errors + message_batch = retry_with_backoff( + lambda: self.client.messages.batches.retrieve( + message_batch_id=batch_id + ), + max_retries=3, + initial_delay=2.0, + backoff_factor=2.0 + ) + + if message_batch.processing_status != "in_progress": + # Batch has finished or was canceled + # print(f"Batch {message_batch.id} processing status is now {message_batch.processing_status}") + status.log( + f"Batch {message_batch.id} processing status is now {message_batch.processing_status}" + ) + + # Retrieve batch results if batch was processed + if message_batch.processing_status == "ended": + cost_tracking = self._finalize_batch( + message_batch=message_batch, + batch_id=batch_id, + message_ids=set(batch_metadata["message_ids"]), + message_data_directory=message_data_directory, + model=batch_metadata.get("model", GENERATION.LLM), + ) + batch_metadata["cost_tracking"] = cost_tracking + + # Update batch metadata + batch_metadata["processing_status"] = ( + 
message_batch.processing_status + ) + batch_metadata["ended_at"] = datetime.now().isoformat() + batch_metadata_json = json.dumps(batch_metadata, indent=2) + batch_data_file = batch_data_directory / f"{batch_id}.json" + batch_data_file.write_text(batch_metadata_json, encoding="utf-8") + + # Dont keep polling this batch + finished_batches.append(batch_metadata) + + time.sleep(sleep_seconds_between_batch) + + for batch_metadata in finished_batches: + running_batches.remove(batch_metadata) + + time.sleep(sleep_seconds_iteration) + + status.stop() + + print(f"Finished awaiting {running_batches_count} batches.") + + def get_total_cost(self, batch_data_directory: pathlib.Path) -> dict: + return get_total_cost(batch_data_directory) + + def print_cost_report( + self, batch_data_directory: pathlib.Path, dataset_log_path: pathlib.Path = None + ): + print_cost_report(batch_data_directory, dataset_log_path) + + def _finalize_batch( + self, + message_batch: MessageBatch, + batch_id: str, + message_ids: set[str], + message_data_directory: pathlib.Path, + model: str, + ) -> dict: + """ + Finalize a batch by processing results and calculating costs. 
+ + Returns: + Dictionary with cost tracking information + """ + assert message_batch.processing_status == "ended" + + # Initialize cost tracking + cost_tracking = { + "total_cost_usd": 0.0, + "total_input_tokens": 0, + "total_output_tokens": 0, + "total_cache_creation_tokens": 0, + "total_cache_read_tokens": 0, + } + + # Stream results file in memory-efficient chunks, processing one at a time + result: MessageBatchIndividualResponse + + # Retrieve results with retry logic for transient errors + results_iterator = retry_with_backoff( + lambda: list(self.client.messages.batches.results(message_batch_id=batch_id)), + max_retries=3, + initial_delay=2.0, + backoff_factor=2.0 + ) + + for result in results_iterator: + # Ensure we know this message in this batch + assert result.custom_id in message_ids, ( + f"Unknown message '{result.custom_id}' in batch '{batch_id}'" + ) + + message_data = { + PromptMsgResultLogKey.custom_id: result.custom_id, + PromptMsgResultLogKey.id: "", + PromptMsgResultLogKey.result_type: result.result.type, + PromptMsgResultLogKey.error: "", + PromptMsgResultLogKey.response: "", + PromptMsgResultLogKey.usage_input_tokens: -1, + PromptMsgResultLogKey.usage_output_tokens: -1, + } + + match result.result.type: + case "succeeded": + res: MessageBatchSucceededResult = result.result + message_data["id"] = res.message.id + + # Extract token usage + input_tokens = res.message.usage.input_tokens + output_tokens = res.message.usage.output_tokens + cache_creation_tokens = getattr( + res.message.usage, "cache_creation_input_tokens", 0 + ) + cache_read_tokens = getattr( + res.message.usage, "cache_read_input_tokens", 0 + ) + + message_data[PromptMsgResultLogKey.usage_input_tokens] = ( + input_tokens + ) + message_data[PromptMsgResultLogKey.usage_output_tokens] = ( + output_tokens + ) + + # Calculate cost for this message + message_cost = calculate_message_cost( + model=model, + input_tokens=input_tokens, + output_tokens=output_tokens, + 
cache_creation_input_tokens=cache_creation_tokens, + cache_read_input_tokens=cache_read_tokens, + ) + + # Update batch totals + cost_tracking["total_cost_usd"] += message_cost + cost_tracking["total_input_tokens"] += input_tokens + cost_tracking["total_output_tokens"] += output_tokens + cost_tracking["total_cache_creation_tokens"] += ( + cache_creation_tokens + ) + cost_tracking["total_cache_read_tokens"] += cache_read_tokens + + if res.message.stop_reason == "refusal": + # The LLM refused to process the request because of a policy violation + url = "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/handle-streaming-refusals" + print(f"[SKIPPING] Policy Violation error ({url})") + message_data[PromptMsgResultLogKey.error] = "refusal" + message_data[PromptMsgResultLogKey.response] = None + else: + # raise Exception(f"Policy violation from Claude API ({url})") + + # Assert that content is of expected shape and type + assert len(res.message.content) == 1 and isinstance( + res.message.content[0], TextBlock + ), ( + f"Content validation failed: len={len(res.message.content)}, " + f"content={res.message.content}, " + f"type={type(res.message.content[0]).__name__ if res.message.content else 'empty'}" + ) + + # Fetch actual response + response: TextBlock = res.message.content[0] + message_data[PromptMsgResultLogKey.response] = response.text + + case "errored": + if result.result.error.type == "invalid_request": + # Request body must be fixed before re-sending request + print(f"Validation error {result.custom_id}") + raise Exception( + f"Validation error from Claude API: {result.result.error}" + ) + else: + # Request can be retried directly + print(f"Server error {result.custom_id} {result.result.error}") + message_data[PromptMsgResultLogKey.error] = ( + f"{result.result.error}" + ) + + # Save message to disk + message_data_file = message_data_directory / f"{result.custom_id}.json" + message_data_json = json.dumps(message_data, indent=2) + 
# Prices per model; calculate_message_cost divides token counts by 1_000_000
# before multiplying, i.e. the values are treated as USD per million tokens.
ANTHROPIC_PRICING = {
    "claude-sonnet-4-20250514": {
        "input": 3.00,
        "output": 15.00,
        "cache_write": 3.75,
        "cache_read": 0.30,
    },
    "claude-sonnet-4-5-20250929": {
        "input": 3.00,
        "output": 15.00,
        "cache_write": 3.75,
        "cache_read": 0.30,
    },
    "claude-haiku-4-5-20251001": {
        "input": 1.00,
        "output": 5.00,
        "cache_write": 1.25,
        "cache_read": 0.10,
    },
}


def calculate_message_cost(
    model: str,
    input_tokens: int,
    output_tokens: int,
    cache_creation_input_tokens: int = 0,
    cache_read_input_tokens: int = 0,
) -> float:
    """
    Calculate the cost of a single message based on token usage.

    The counters are expected exactly as reported in the API ``usage`` object:
    ``input_tokens`` holds only the input tokens that were neither written to
    nor read from the cache, so the two cache counters are billed in addition
    to it and must not be subtracted from it. No batch discount is applied
    here.

    Args:
        model: The model name (e.g., "claude-sonnet-4-5-20250929")
        input_tokens: Number of uncached input tokens
        output_tokens: Number of output tokens
        cache_creation_input_tokens: Number of tokens used for cache creation
            (None is treated as 0)
        cache_read_input_tokens: Number of tokens read from cache
            (None is treated as 0)

    Returns:
        Cost in USD
    """
    if model not in ANTHROPIC_PRICING:
        print(f"Warning: Unknown model '{model}'. Using Claude Sonnet 4.5 pricing.")
        model = "claude-sonnet-4-5-20250929"

    pricing = ANTHROPIC_PRICING[model]

    # Usage objects may carry None for the cache counters
    cache_creation_input_tokens = cache_creation_input_tokens or 0
    cache_read_input_tokens = cache_read_input_tokens or 0

    cost_usd = (
        (input_tokens / 1_000_000) * pricing["input"]
        + (output_tokens / 1_000_000) * pricing["output"]
        + (cache_creation_input_tokens / 1_000_000) * pricing["cache_write"]
        + (cache_read_input_tokens / 1_000_000) * pricing["cache_read"]
    )

    return cost_usd
pathlib.Path, dataset_log_path: pathlib.Path | None = None +): + """ + Print a formatted cost report using Rich tables. + + Args: + batch_data_directory: Directory containing batch metadata files + dataset_log_path: Optional path to dataset log for per-document cost calculation + """ + single_page_pdfs_count = -1 + if dataset_log_path and dataset_log_path.exists(): + dataset_log = json.loads(dataset_log_path.read_text(encoding="utf-8")) + single_page_pdfs_count = dataset_log.get("valid_samples", {}).get("total", 0) + + total_cost_summary = get_total_cost(batch_data_directory) + + console = Console() + + table = Table( + title="Batch Cost Report", show_header=True, header_style="bold magenta" + ) + table.add_column("Metric", style="cyan", width=35) + table.add_column("Value", justify="right", style="white", width=20) + + table.add_row("Number of batches", str(total_cost_summary["num_batches"])) + table.add_row("Number of messages", str(total_cost_summary["num_messages"])) + table.add_row("Number of PDFs", str(single_page_pdfs_count)) + + table.add_section() + table.add_row("Total input tokens", f"{total_cost_summary['total_input_tokens']:,}") + table.add_row( + "Total output tokens", f"{total_cost_summary['total_output_tokens']:,}" + ) + table.add_row( + "Total cache creation tokens", + f"{total_cost_summary['total_cache_creation_tokens']:,}", + ) + table.add_row( + "Total cache read tokens", f"{total_cost_summary['total_cache_read_tokens']:,}" + ) + + table.add_section() + total_cost_usd = total_cost_summary["total_cost_usd"] / 2.0 + table.add_row( + "[bold green]TOTAL COST \n(including 50% batch discount)[/bold green]", + f"[bold green]${total_cost_usd:.2f} USD[/bold green]", + ) + + if single_page_pdfs_count > 0: + avg_cost_per_document = total_cost_usd / single_page_pdfs_count + table.add_row( + "[bold yellow]Average cost per document[/bold yellow]", + f"[bold yellow]${avg_cost_per_document:.2f} USD[/bold yellow]", + ) + table.add_row( + " Documents counted", 
f"{single_page_pdfs_count} single-page PDFs" + ) + + console.print() + console.print(table) + console.print() + + return total_cost_summary diff --git a/docgenie/generation/pipeline_01/deepseek.py b/docgenie/generation/pipeline_01/deepseek.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docgenie/generation/pipeline_01/opensource_batching.py b/docgenie/generation/pipeline_01/opensource_batching.py new file mode 100755 index 0000000000000000000000000000000000000000..4041f96c0200a18e4285ab64f4b1930c25efea80 --- /dev/null +++ b/docgenie/generation/pipeline_01/opensource_batching.py @@ -0,0 +1,369 @@ +from dataclasses import dataclass +from datetime import datetime +import time +import os +import pathlib +import uuid +from typing import Iterable, Literal +import anthropic +from anthropic.types.message_create_params import MessageCreateParamsNonStreaming +from anthropic.types import ( + MessageParam, + ImageBlockParam, + TextBlockParam, + Base64ImageSourceParam, + TextBlock, +) +from anthropic.types.messages.batch_create_params import Request +from anthropic.types.messages.message_batch import MessageBatch +from anthropic.types.messages.message_batch_individual_response import ( + MessageBatchIndividualResponse, +) +from anthropic.types.messages.message_batch_succeeded_result import ( + MessageBatchSucceededResult, +) + +from docgenie import ENV, LLM, GENERATION + +import json + +from docgenie.generation.models import PromptMsgResultLogKey, SynDatasetDefinition +from docgenie.generation.utils.serialization import image_to_base64 +from docgenie.generation.utils.status import StatusLine +from docgenie.generation.pipeline_01.cost import ( + calculate_message_cost, + get_total_cost, + print_cost_report, +) + + +def create_batch( + client: anthropic.Anthropic, + id_to_message: dict[str, MessageParam], + model=GENERATION.LLM, + max_tokens=GENERATION.MAX_TOKENS, +): + requests = [] + for msg_id, msg 
def create_message(
    prompt: str,
    images_base64: list[str],
    media_type: str = "image/jpeg",
):
    """
    Build a single user message consisting of the prompt followed by images.

    Args:
        prompt: Prompt text; placed first and marked with an ephemeral
            cache_control entry
        images_base64: Base64 encoded images appended after the prompt
            (may be empty or None)
        media_type: Media type declared for every image; defaults to the
            previously hard-coded "image/jpeg"

    Returns:
        MessageParam with role "user"
    """
    # Only prompt is cached, images not (because they come after) as they change with each call
    content = [
        TextBlockParam(text=prompt, type="text", cache_control={"type": "ephemeral"})
    ]
    content.extend(
        ImageBlockParam(
            source=Base64ImageSourceParam(
                media_type=media_type, type="base64", data=img_base64
            ),
            type="image",
        )
        for img_base64 in images_base64 or ()
    )

    return MessageParam(
        role="user",
        content=content,
    )
def send_batch(
    self,
    model: str,
    prompts: Iterable[str],
    images_base64: Iterable[list[str]],
    image_docids: Iterable[list[str]],
    batch_data_directory: pathlib.Path,
    max_tokens: int = 8192,
):
    """
    Submit one message batch and store its metadata as ``<batch_id>.json``.

    The i-th prompt, image list and seed document id list together form one
    message; each message gets a fresh UUID as its custom id.

    Args:
        model: Model name used for every message of the batch
        prompts: Prompt text per message
        images_base64: Base64 encoded images per message
        image_docids: Seed document ids per message (stored in the metadata)
        batch_data_directory: Directory the batch metadata file is written to
        max_tokens: Maximum number of output tokens per message

    Raises:
        ValueError: If the three iterables do not have the same length
    """
    # Make sure the metadata can be stored before the batch is submitted,
    # otherwise the batch id would be lost if the directory is missing.
    batch_data_directory.mkdir(parents=True, exist_ok=True)

    # Collect batch data
    id_to_message = dict()
    id_to_message_seed_docids = dict()
    # The inputs may be generators, so their lengths cannot be compared up
    # front; strict=True raises instead of silently dropping surplus entries.
    for prompt, image_base64s, seed_docids in zip(
        prompts, images_base64, image_docids, strict=True
    ):
        # Create GUID message ID
        message_id = str(uuid.uuid4())
        id_to_message[message_id] = create_message(
            prompt=prompt, images_base64=image_base64s
        )
        id_to_message_seed_docids[message_id] = seed_docids

    # Send batch
    batch_id = create_batch(
        client=self.client,
        id_to_message=id_to_message,
        model=model,
        max_tokens=max_tokens,
    )

    # Store batch data
    batch_metadata = {
        "id": batch_id,
        "model": model,
        "processing_status": "in_progress",
        "message_ids": list(id_to_message.keys()),
        "message_id_to_seed_docids": id_to_message_seed_docids,
        "created_at": datetime.now().isoformat(),
        "ended_at": "",
        "cost_tracking": {
            "total_cost_usd": 0.0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "total_cache_creation_tokens": 0,
            "total_cache_read_tokens": 0,
        },
    }
    batch_data_file = batch_data_directory / f"{batch_id}.json"
    batch_data_file.write_text(
        json.dumps(batch_metadata, indent=2), encoding="utf-8"
    )
batch_metadata["processing_status"] == "in_progress": + running_batches.append(batch_metadata) + awaited_messages_total += len(batch_metadata["message_ids"]) + + return running_batches, awaited_messages_total + + def await_batches( + self, + batch_data_directory: pathlib.Path, + message_data_directory: pathlib.Path, + sleep_seconds_between_batch: float = 2, + sleep_seconds_iteration: float = 30, + ): + running_batches, awaited_messages_total = self.get_running_batches( + batch_data_directory=batch_data_directory + ) + running_batches_count = len(running_batches) + print( + f"Found {running_batches_count} batches with {awaited_messages_total} messages in total." + ) + + status = StatusLine() + status.start() + + while any(running_batches): + finished_batches = [] + # print(f"Awaiting {len(running_batches)} batches...") + status.update_message(f"Awaiting {len(running_batches)} batches...") + + for batch_metadata in running_batches: + batch_id = batch_metadata["id"] + message_batch = self.client.messages.batches.retrieve( + message_batch_id=batch_id + ) + + if message_batch.processing_status != "in_progress": + # Batch has finished or was canceled + # print(f"Batch {message_batch.id} processing status is now {message_batch.processing_status}") + status.log( + f"Batch {message_batch.id} processing status is now {message_batch.processing_status}" + ) + + # Retrieve batch results if batch was processed + if message_batch.processing_status == "ended": + cost_tracking = self._finalize_batch( + message_batch=message_batch, + batch_id=batch_id, + message_ids=set(batch_metadata["message_ids"]), + message_data_directory=message_data_directory, + model=batch_metadata.get("model", GENERATION.LLM), + ) + batch_metadata["cost_tracking"] = cost_tracking + + # Update batch metadata + batch_metadata["processing_status"] = ( + message_batch.processing_status + ) + batch_metadata["ended_at"] = datetime.now().isoformat() + batch_metadata_json = json.dumps(batch_metadata, indent=2) + 
batch_data_file = batch_data_directory / f"{batch_id}.json" + batch_data_file.write_text(batch_metadata_json, encoding="utf-8") + + # Dont keep polling this batch + finished_batches.append(batch_metadata) + + time.sleep(sleep_seconds_between_batch) + + for batch_metadata in finished_batches: + running_batches.remove(batch_metadata) + + time.sleep(sleep_seconds_iteration) + + status.stop() + + print(f"Finished awaiting {running_batches_count} batches.") + + def get_total_cost(self, batch_data_directory: pathlib.Path) -> dict: + return get_total_cost(batch_data_directory) + + def print_cost_report( + self, batch_data_directory: pathlib.Path, dataset_log_path: pathlib.Path = None + ): + print_cost_report(batch_data_directory, dataset_log_path) + + def _finalize_batch( + self, + message_batch: MessageBatch, + batch_id: str, + message_ids: set[str], + message_data_directory: pathlib.Path, + model: str, + ) -> dict: + """ + Finalize a batch by processing results and calculating costs. + + Returns: + Dictionary with cost tracking information + """ + assert message_batch.processing_status == "ended" + + # Initialize cost tracking + cost_tracking = { + "total_cost_usd": 0.0, + "total_input_tokens": 0, + "total_output_tokens": 0, + "total_cache_creation_tokens": 0, + "total_cache_read_tokens": 0, + } + + # Stream results file in memory-efficient chunks, processing one at a time + result: MessageBatchIndividualResponse + for result in self.client.messages.batches.results(message_batch_id=batch_id): + # Ensure we know this message in this batch + assert result.custom_id in message_ids, ( + f"Unknown message '{result.custom_id}' in batch '{batch_id}'" + ) + + message_data = { + PromptMsgResultLogKey.custom_id: result.custom_id, + PromptMsgResultLogKey.id: "", + PromptMsgResultLogKey.result_type: result.result.type, + PromptMsgResultLogKey.error: "", + PromptMsgResultLogKey.response: "", + PromptMsgResultLogKey.usage_input_tokens: -1, + PromptMsgResultLogKey.usage_output_tokens: 
-1, + } + + match result.result.type: + case "succeeded": + res: MessageBatchSucceededResult = result.result + message_data["id"] = res.message.id + + # Extract token usage + input_tokens = res.message.usage.input_tokens + output_tokens = res.message.usage.output_tokens + cache_creation_tokens = getattr( + res.message.usage, "cache_creation_input_tokens", 0 + ) + cache_read_tokens = getattr( + res.message.usage, "cache_read_input_tokens", 0 + ) + + message_data[PromptMsgResultLogKey.usage_input_tokens] = ( + input_tokens + ) + message_data[PromptMsgResultLogKey.usage_output_tokens] = ( + output_tokens + ) + + # Calculate cost for this message + message_cost = calculate_message_cost( + model=model, + input_tokens=input_tokens, + output_tokens=output_tokens, + cache_creation_input_tokens=cache_creation_tokens, + cache_read_input_tokens=cache_read_tokens, + ) + + # Update batch totals + cost_tracking["total_cost_usd"] += message_cost + cost_tracking["total_input_tokens"] += input_tokens + cost_tracking["total_output_tokens"] += output_tokens + cost_tracking["total_cache_creation_tokens"] += ( + cache_creation_tokens + ) + cost_tracking["total_cache_read_tokens"] += cache_read_tokens + + if res.message.stop_reason == "refusal": + # The LLM refused to process the request because of a policy violation + url = "https://docs.claude.com/en/docs/test-and-evaluate/strengthen-guardrails/handle-streaming-refusals" + print(f"[SKIPPING] Policy Violation error ({url})") + message_data[PromptMsgResultLogKey.error] = "refusal" + message_data[PromptMsgResultLogKey.response] = None + else: + # raise Exception(f"Policy violation from Claude API ({url})") + + # Assert that content is of expected shape and type + assert len(res.message.content) == 1 and isinstance( + res.message.content[0], TextBlock + ), ( + f"Content validation failed: len={len(res.message.content)}, " + f"content={res.message.content}, " + f"type={type(res.message.content[0]).__name__ if res.message.content else 
'empty'}" + ) + + # Fetch actual response + response: TextBlock = res.message.content[0] + message_data[PromptMsgResultLogKey.response] = response.text + + case "errored": + if result.result.error.type == "invalid_request": + # Request body must be fixed before re-sending request + print(f"Validation error {result.custom_id}") + raise Exception( + f"Validation error from Claude API: {result.result.error}" + ) + else: + # Request can be retried directly + print(f"Server error {result.custom_id} {result.result.error}") + message_data[PromptMsgResultLogKey.error] = ( + f"{result.result.error}" + ) + + # Save message to disk + message_data_file = message_data_directory / f"{result.custom_id}.json" + message_data_json = json.dumps(message_data, indent=2) + message_data_file.write_text(message_data_json, encoding="utf-8") + + return cost_tracking diff --git a/docgenie/generation/pipeline_01_select_seeds.py b/docgenie/generation/pipeline_01_select_seeds.py new file mode 100755 index 0000000000000000000000000000000000000000..2beba24bffdb6c5023ef036909a2a19f62bb8f8c --- /dev/null +++ b/docgenie/generation/pipeline_01_select_seeds.py @@ -0,0 +1,163 @@ +""" +TODO: select seeds based on clusters +""" + +import math +from pathlib import Path + +import pandas as pd +from tqdm import tqdm + +from docgenie.analyzation.clustering.cmds.generate_seeds import ( + GenerateSeedsConfig, + generate_seeds_for_embedding_type, +) +from docgenie.analyzation.clustering.core._utilities import EmbeddingType +from docgenie.data.interface import load_dataset +from docgenie.generation.constants import SEED_IMAGE_MAX_WIDTH, SEED_IMAGE_QUALITY +from docgenie.generation.models import PipelineParameters, SynDatasetDefinition +from docgenie.generation.utils.image import ( + downscale_and_compress, +) +from docgenie.generation.utils.log import log_pipeline_level + + +def prepare_seed_images(dsdef: SynDatasetDefinition, seeds_df: pd.DataFrame): + dsfiles = dsdef.get_file_structure() + + dataset = 
def visualize_selected_clusters(clusters_df, save_to):
    """
    Save a bar chart showing how often each cluster occurs in ``clusters_df``.

    Args:
        clusters_df: DataFrame whose cell values are cluster identifiers; all
            cells are counted regardless of their column
        save_to: Path the chart image is written to
    """
    from collections import Counter

    import matplotlib.pyplot as plt

    # Count occurrences per cluster over the flattened table
    cluster_counts = Counter(clusters_df.values.flatten())

    # Sort by cluster index for plotting
    clusters_sorted = sorted(cluster_counts)
    counts_sorted = [cluster_counts[c] for c in clusters_sorted]

    # Draw on a dedicated figure and always close it, so the chart is not
    # painted over an existing current figure and nothing stays open afterwards.
    fig = plt.figure()
    try:
        plt.bar(clusters_sorted, counts_sorted)
        plt.xlabel("Cluster")
        plt.ylabel("Frequency")
        plt.title("Histogram of Cluster Occurrences")
        plt.savefig(save_to)
    finally:
        plt.close(fig)
visualize_selected_clusters( + clusters_df, save_to=clusters_path.with_name("clusters_hist.png") + ) diff --git a/docgenie/generation/pipeline_02_prompt_llm.py b/docgenie/generation/pipeline_02_prompt_llm.py new file mode 100755 index 0000000000000000000000000000000000000000..9df5fda3ab7cdd991752427f208d70084fac868a --- /dev/null +++ b/docgenie/generation/pipeline_02_prompt_llm.py @@ -0,0 +1,175 @@ +import json +import math +import os + +import pandas as pd +from docgenie import GENERATION +from docgenie.generation.constants import ( + PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_MAX_SIZE, + PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_BATCH, + PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_ITERATION, +) +from docgenie.generation.utils.serialization import image_to_base64 +from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient +from docgenie.generation.models import ( + PipelineParameters, + SynDatasetDefinition, + LLMType, + SyntheticDatasetFileStructure, +) +from docgenie.generation.utils.log import log_pipeline_level + + +def _create_batch( + prompt: str, + cur_msg_index: int, + seeds_df: pd.DataFrame, + batch_size: int, + dsfiles: SyntheticDatasetFileStructure, +): + def prompt_gen(): + for _ in range(batch_size): + yield prompt + + def seed_doc_ids_gen(): + for i in range(batch_size): + index = (cur_msg_index + i) % len( + seeds_df + ) # Start all over again when we have more messages than seeds + seeds_row = seeds_df.iloc[index] + doc_ids = seeds_row.tolist() + yield doc_ids + + batch_seed_doc_ids = list(seed_doc_ids_gen()) + + def imgs_gen(): + for i in range(batch_size): + doc_ids = batch_seed_doc_ids[i] + img_base64 = [ + image_to_base64( + imgpath=dsfiles.preprocessed_seed_images_directory / f"{docid}.jpg" + ) + for docid in doc_ids + ] + yield img_base64 + + return cur_msg_index + batch_size, prompt_gen(), imgs_gen(), batch_seed_doc_ids + + +def 
def pipeline_retrieve_document_html_seed_based(
    params: PipelineParameters,
):
    """
    Send the prompt batches that are still missing and await all open batches.

    The number of missing documents is derived from the message results and
    running batches already on disk. Missing prompt calls are sent in batches
    of at most PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_MAX_SIZE
    messages; afterwards all running batches are awaited and their responses
    written to the message results directory.

    Args:
        params: Pipeline parameters (dataset definition, LLM type and the
            optional name of the environment variable holding the API key)

    Raises:
        RuntimeError: If the API key environment variable is not set or empty
    """
    log_pipeline_level()

    prompt_batch_max_size = PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_MAX_SIZE
    prompt_batch_poll_sleep_seconds_between_batch = PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_BATCH
    prompt_batch_poll_sleep_seconds_between_iteration = PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_ITERATION

    dsdef = params.dsdef
    # Check which LLM Type sanity
    assert params.llmtype.value in [e.value for e in LLMType], (
        f"Invalid model:{params.llmtype.value}"
    )

    dsfiles = dsdef.get_file_structure()
    prompt = dsdef.get_prompt()

    seeds_path = dsfiles.base_path / "seeds.csv"
    seeds_df = pd.read_csv(seeds_path)
    assert len(seeds_df.columns) == dsdef.seed_images_count

    # TODO Saifullah: implement opensource VLMs here

    api_key_env_variable_name = params.api_key_env_variable_name or "ANTHROPIC_API_KEY"
    api_key = os.getenv(api_key_env_variable_name)
    if not api_key:
        # Fail with a clear message instead of a TypeError from len(None)
        raise RuntimeError(
            f"Environment variable '{api_key_env_variable_name}' is not set or empty"
        )
    print(f"{api_key_env_variable_name=} len: {len(api_key)}")
    if params.api_key_env_variable_name:
        # A non-default key was requested explicitly: let the user confirm it
        input("PRESS ENTER TO CONFIRM")

    client = ClaudeBatchedClient(api_key=api_key)
    _, awaited_messages_total = client.get_running_batches(
        batch_data_directory=dsfiles.prompt_batches_directory
    )
    total_message_responses, remaining_documents_count, remaining_prompt_calls = (
        get_remaining_prompt_calls_count(
            dsdef=dsdef, awaited_messages_total=awaited_messages_total
        )
    )

    cur_msg_index = total_message_responses
    if remaining_documents_count > 0:
        num_solutions = dsdef.prompt_params.num_solutions
        num_batches = math.ceil(remaining_prompt_calls / prompt_batch_max_size)
        print(f"{remaining_prompt_calls=} {num_batches=}")

        for _ in range(num_batches):
            # For each batch, we select images from all seed images
            this_batch_size = min(
                math.ceil(remaining_documents_count / num_solutions),
                prompt_batch_max_size,
            )
            cur_msg_index, batch_prompts, batch_imgs, batch_seed_docids = _create_batch(
                prompt=prompt,
                cur_msg_index=cur_msg_index,
                seeds_df=seeds_df,
                batch_size=this_batch_size,
                dsfiles=dsfiles,
            )

            # Batch is sent to LLM and batch infos are stored locally on disk
            client.send_batch(
                model=GENERATION.LLM,
                prompts=batch_prompts,
                images_base64=batch_imgs,
                image_docids=batch_seed_docids,
                batch_data_directory=dsfiles.prompt_batches_directory,
                max_tokens=GENERATION.MAX_TOKENS,  # up to 32k
            )

            # this_batch_size counts prompt calls, and each call is expected
            # to yield num_solutions documents; subtracting only the call
            # count would oversize the following batches.
            remaining_documents_count -= this_batch_size * num_solutions

    # Await all previously sent batches
    # LLM Responses are saved to disk
    client.await_batches(
        batch_data_directory=dsfiles.prompt_batches_directory,
        message_data_directory=dsfiles.message_results_directory,
        sleep_seconds_between_batch=prompt_batch_poll_sleep_seconds_between_batch,
        sleep_seconds_iteration=prompt_batch_poll_sleep_seconds_between_iteration,
    )
+) +from docgenie.generation.utils.handwriting import get_all_author_ids +from docgenie.generation.utils.visualelement import get_visual_element_id + +# Your input HTML (replace with reading from file if needed) +html = """ + + + + + +

Hello World

def mark_visual_elements_for_ocr(soup: BeautifulSoup):
    """Replace every ``data-placeholder`` element's content with its marker id.

    Each matching element is emptied, given the string returned by
    ``get_visual_element_id`` for its index, and its inline style is extended
    with a 1px centred flex layout. The (mutated) soup is returned.
    """
    marker_css = (
        ("font-size", "1px"),
        ("text-align", "center"),
        ("display", "flex"),
        ("justify-content", "center"),
        ("align-items", "center"),
    )

    placeholders = soup.find_all(attrs={"data-placeholder": True})
    for index, element in enumerate(placeholders):
        element.clear()  # type: ignore
        element.string = get_visual_element_id(index)  # type: ignore

        # Existing declarations (position, size, z-index, ...) are kept; only
        # the marker properties are added or overwritten.
        declaration = cssutils.parseStyle(element.get("style"))  # type: ignore
        for prop, value in marker_css:
            declaration[prop] = value
        element["style"] = declaration.cssText  # type: ignore

    return soup

Hello World

+ + +""" + +# html_path = ENV.DATA_DIR / "html" / "receipt3.html" +# html = html_path.read_text(encoding="utf-8") + + +# Get cssutils logger +cssutils_logger = logging.getLogger("CSSUTILS") + +# Remove all handlers (if any) and prevent propagation +cssutils_logger.handlers.clear() +cssutils_logger.propagate = False + +# Add a NullHandler so it discards all logs +cssutils_logger.addHandler(logging.NullHandler()) + + +def unmark_visual_elements_alt(soup: BeautifulSoup): + fields = soup.find_all(attrs={"data-placeholder": True}) + + style_tag = soup.find("style") + if not style_tag: + raise ValueError("No +""" + + html_content = re.sub( + r"", override_style, html_content, count=1, flags=re.IGNORECASE + ) + + return html_content + + +def preprocess_html_for_pdf(html_content: str) -> str: + """ + Universal preprocessing for LLM-generated HTML. + Removes @page rules and ensures content determines PDF size. + Safe for: receipts, multi-column, tables, any layout. + """ + # Step 1: Remove @page rules + html_content = re.sub( + r"@page\s*\{[^}]*\}", "", html_content, flags=re.IGNORECASE | re.DOTALL + ) + + # Step 2: Add override styles + # Key: Don't force any widths - let content be measured as-is + override_style = """ +""" + + html_content = re.sub( + r"", override_style, html_content, count=1, flags=re.IGNORECASE + ) + + return html_content + + +async def render_pdf_async( + doc_id, + html, + dsfiles: SyntheticDatasetFileStructure, + extract_geos_for_classes: list[str], + semaphore: asyncio.Semaphore, + max_retries=2, + timeout_seconds=60, # Add timeout parameter +): + """ + Async version: Render PDF using Playwright with automatic size detection. + Also extracts element geometries for specified classes. 
+ """ + selectorMap = { + "layout_element": '[class*="LE-"]', + "handwriting": f".{HANDWRITING_CLASS_NAME}", + "visual_element": "[data-placeholder]", + } + if any(extract_geos_for_classes): + selectorMap["custom"] = ", ".join([f".{c}" for c in extract_geos_for_classes]) + + last_error = None + for attempt in range(1, max_retries + 2): + browser = None + try: + pdf_path = dsfiles.pdf_initial_directory / f"{doc_id}.pdf" + render_html_path = dsfiles.render_html_directory / f"{doc_id}.html" + geometry_json_path = dsfiles.geometries_directory / f"{doc_id}.json" + + # Preprocess HTML (synchronous - fast) + html = preprocess_html_for_pdf(html) + soup = BeautifulSoup(html, BS_PARSER) + + soup = increase_handwriting_font_size( + soup, dbg=doc_id == "1f100208-1fd8-4f60-b071-a51be9d7b495_2" + ) + # soup = postprocess_handwriting(soup) + soup = unmark_visual_elements(soup) + + prep_html = soup.prettify() + render_html_path.write_text(prep_html, encoding="utf-8") # type: ignore + + # Acquire semaphore for Chromium concurrency control + async with semaphore: + try: + async with asyncio.timeout(timeout_seconds): + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + + # Load HTML + await page.goto( + f"file://{render_html_path}", + wait_until="domcontentloaded", + ) + await page.emulate_media(media="screen") + + # Auto-detect content dimensions + dimensions = await page.evaluate(MEASURE_DIMENSIONS) + + page_width_px = dimensions["width"] + page_height_px = dimensions["height"] + + # Set viewport and wait for layout + await page.set_viewport_size( + {"width": page_width_px, "height": page_height_px} + ) + await page.wait_for_timeout(30) + + # Extract geometries + # class_selectors = ", ".join( + # [f".{cls}" for cls in extract_positions_for_classes] + # ) + + geo_eval_str = f""" + () => {{ + const data = []; + + // Define individual selectors with labels + const selectorMap = {selectorMap}; + + const 
processedElements = new Map(); // Use Map to track matches + + // First pass: collect all elements and their matching selectors + Object.entries(selectorMap).forEach(([label, selector]) => {{ + document.querySelectorAll(selector).forEach(el => {{ + if (!processedElements.has(el)) {{ + processedElements.set(el, []); + }} + processedElements.get(el).push(label); + }}); + }}); + + // Second pass: create geometry data for each unique element + processedElements.forEach((selectorTypes, el) => {{ + const rect = el.getBoundingClientRect(); + const computed = window.getComputedStyle(el); + + // Get text content (matches your Python logic) + let text = ''; + if (el.tagName.toLowerCase() === 'input') {{ + text = (el.value || '').trim(); + }} else {{ + text = (el.innerText || el.textContent || '').trim(); + }} + + data.push({{ + id: el.id || null, + tag: el.tagName.toLowerCase(), + classes: el.className || null, + rect: {{ + x: rect.x, + y: rect.y, + width: rect.width, + height: rect.height + }}, + visibility: computed.visibility, + dataContent: el.getAttribute('data-content') || null, + dataPlaceholder: el.getAttribute('data-placeholder') || null, + style: el.getAttribute('style') || null, + text: text, + selectorTypes: selectorTypes // Array of all matching selector types + }}); + }}); + + return data; + }} + """ + # input(geo_eval_str) + geometries = await page.evaluate(geo_eval_str) + + # Generate PDF + page_width_inches = page_width_px / 96 + page_height_inches = page_height_px / 96 + + await page.pdf( + path=str(pdf_path), + width=f"{page_width_inches}in", + height=f"{page_height_inches}in", + margin={ + "top": "0", + "bottom": "0", + "left": "0", + "right": "0", + }, + print_background=True, + display_header_footer=False, + prefer_css_page_size=False, + scale=1.0, + ) + + await browser.close() + except asyncio.TimeoutError: + print( + f"PDF rendering timed out after {timeout_seconds}s for {doc_id}" + ) + raise TimeoutError( + f"PDF rendering timed out after 
{timeout_seconds}s for {doc_id}" + ) + finally: + # Ensure browser closes even on timeout + if browser is not None: + try: + await browser.close() + except Exception: + pass + # os.unlink(temp_path) + + # Convert CSS pixels to PDF points + scale = 72 / 96 + for g in geometries: + g["rect"]["x"] *= scale + g["rect"]["y"] *= scale + g["rect"]["width"] *= scale + g["rect"]["height"] *= scale + + # Save geometry JSON + with open(geometry_json_path, "w") as f: + json.dump(geometries, f, indent=2) + + # DEBUG + draw_geos_on_pdf( + geos=geometries, + pdf_in=pdf_path, + pdf_out=dsfiles.debug_pdf_geometries_directory / f"{doc_id}.pdf", + ) + + pdf_num_pages = safe_count_pages(pdf_path) + + return { + DocLogKey.document_id: doc_id, + DocLogKey.render_html_width: page_width_px, + DocLogKey.render_html_height: page_height_px, + DocLogKey.pdf_num_pages: pdf_num_pages, + DocLogKey.pdf_render_error: None, + DocLogKey.num_geometries_extracted: len(geometries), + } + + except Exception as e: + print(f"[yellow]Attempt {attempt} failed for {doc_id}: {e}") + await asyncio.sleep(1) + last_error = str(e) + + return { + DocLogKey.document_id: doc_id, + DocLogKey.render_html_width: None, + DocLogKey.render_html_height: None, + DocLogKey.pdf_num_pages: None, + DocLogKey.pdf_render_error: last_error, + DocLogKey.num_geometries_extracted: 0, + } + + +async def process_batch_async( + html_data, + dsfiles, + extract_geos_for_classes, + chromium_concurrency, + dsdef, + progress, + render_task, + max_retries, + timeout_seconds, +): + """Process a batch of PDFs asynchronously.""" + semaphore = asyncio.Semaphore(chromium_concurrency) + + tasks = [ + render_pdf_async( + doc_id, + html, + dsfiles, + extract_geos_for_classes, + semaphore, + max_retries=max_retries, + timeout_seconds=timeout_seconds, + ) + for (doc_id, html) in html_data + ] + + results = [] + for coro in asyncio.as_completed(tasks): + try: + result = await coro + dsdef.write_to_document_log( + 
document_id=result[DocLogKey.document_id], vals=result + ) + progress.update(render_task, advance=1) + results.append(result) + + if result[DocLogKey.pdf_render_error]: + print( + f"[red]PDF failed for {result[DocLogKey.document_id]}: {result[DocLogKey.pdf_render_error]}" + ) + elif ( + result[DocLogKey.pdf_num_pages] and result[DocLogKey.pdf_num_pages] > 1 + ): + print( + f"[yellow]Warning: {result[DocLogKey.document_id]} rendered to {result[DocLogKey.pdf_num_pages]} pages" + ) + + except Exception as e: + print(f"[red]Unexpected error: {e}") + progress.update(render_task, advance=1) + + return results + + +def pipeline_render_pdf_and_extract_geos_parallel(params: PipelineParameters): + """ + Render HTML documents to PDF using async Playwright with automatic size detection. + Much faster than sync version! + """ + log_pipeline_level() + + chromium_concurrency = PIPELINE_03_RENDER_PDF__CHROMIUM_CONCURRENCY + max_retries = PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_MAX_RETRIES + timeout_seconds = PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_TIMEOUT + # extract_positions_for_classes = ["handwritten"] # or whatever you need + + dsdef = params.dsdef + dsfiles = dsdef.get_file_structure() + extract_geos_for_classes = dsdef.valid_labels or [] + + # Get valid documents that need PDF generation + html_data = [] + total_samples = 0 + + for doc in dsdef.get_document_logs(): + total_samples += 1 + pdf_path = dsfiles.pdf_initial_directory / f"{doc.document_id}.pdf" + valid_pdf = pdf_path.exists() and doc.pdf_num_pages == 1 + if not valid_pdf: + html_path = dsfiles.raw_html_directory / f"{doc.document_id}.html" + if html_path.exists(): + html = html_path.read_text(encoding="utf-8") + html_data.append((doc.document_id, html)) + + total = len(html_data) + print( + f"{total} valid samples out of {total_samples} total samples need to be converted." 
def convert_from_path_singlepage(
    pdf_path: pathlib.Path, target_size: tuple[int, int] | None = None
) -> Image.Image:
    """Rasterise a one-page PDF at ``PDF_DPI`` and return its only page image.

    ``target_size`` is forwarded as the ``size`` argument of
    ``convert_from_path``. An assertion rejects PDFs that do not yield exactly
    one image.
    """
    pages = convert_from_path(pdf_path, dpi=PDF_DPI, size=target_size)
    page_count = len(pages)
    assert page_count == 1, "Multi-page document are not supported"
    return pages[0]
PipelineParameters, +) +from docgenie.generation.models._syndatadef import SynDatasetDefinition +from docgenie.generation.pipeline_04.extract_bbox import ( + extract_bboxes_from_pdf, + validate_char_bbox_word_mapping, +) +from docgenie.generation.utils.bboxes import ( + draw_bboxes_on_pdf, + read_syn_dataset_bboxes, + save_bboxes, +) +from docgenie.generation.utils.debug import draw_geos_and_bboxes_on_pdf +from docgenie.generation.utils.log import log_pipeline_level +from docgenie.generation.utils.status import get_progress_bar + + +def draw_bbox_debug(dsdef: SynDatasetDefinition, docid: str): + dsfiles = dsdef.get_file_structure() + + bbox_norm_path = dsfiles.get_pdf_bbox_path(level="word", doc_id=docid) + bbox_unnorm = read_syn_dataset_bboxes(bbox_norm_path) + + pdf_path = dsfiles.pdf_initial_directory / f"{docid}.pdf" + outpath = dsfiles.debug_pdf_bboxes_directory / f"{docid}.pdf" + + geo_path = dsfiles.geometries_directory / f"{docid}.json" + geos = json.loads(geo_path.read_text(encoding="utf-8")) + + outpath2 = dsfiles.debug_pdf_bboxes_and_geos_directory / f"{docid}.pdf" + + try: + draw_bboxes_on_pdf(pdf_path=pdf_path, outpath=outpath, bboxes=bbox_unnorm) + draw_geos_and_bboxes_on_pdf( + pdf_in=pdf_path, + pdf_out=outpath2, + bboxes_=bbox_unnorm, + geos=geos, + verbose=False, + ) + except Exception as err: + print(f"[ERROR]: Skipping debug PDF: {str(err)}") + + +def pipeline_extract_bboxes(params: PipelineParameters): + log_pipeline_level() + + dsdef = params.dsdef + dsfiles = dsdef.get_file_structure() + + # Get valid PDF paths (single page, not processed yet) + valid_document_ids = [] + total_pdfs_count = 0 + for doclog in dsdef.get_document_logs(): + total_pdfs_count += 1 + if doclog.pdf_num_pages == 1: + bbox_path = dsfiles.get_pdf_bbox_path( + level="word", doc_id=doclog.document_id + ) + if not bbox_path.exists(): + valid_document_ids.append(doclog.document_id) + + print( + f"{len(valid_document_ids)} out of {total_pdfs_count} PDFs valid for BBox 
extraction." + ) + + with get_progress_bar() as progress: + bbox_task = progress.add_task( + f"[red]Extracting BBoxes from {len(valid_document_ids)} PDFs...", + total=len(valid_document_ids), + ) + + for document_id in valid_document_ids: + pdf_path = dsfiles.pdf_initial_directory / f"{document_id}.pdf" + + word_bboxes = extract_bboxes_from_pdf(pdf_path=pdf_path, level="word") + # Save word level bounding boxes + save_bboxes( + bboxes=word_bboxes, + bbox_path=dsfiles.get_pdf_bbox_path(level="word", doc_id=document_id), + ) + + if params.debug: + draw_bbox_debug(dsdef=dsdef, docid=document_id) + + # Save character level bounding boxes for splitting handwritting text + # before inputting to difussion model (they support only short text) + char_bboxes = extract_bboxes_from_pdf(pdf_path=pdf_path, level="char") + can_map_chars_to_words = validate_char_bbox_word_mapping( + char_bboxes=char_bboxes, word_bboxes=word_bboxes + ) + + if can_map_chars_to_words: + save_bboxes( + bboxes=char_bboxes, + bbox_path=dsfiles.get_pdf_bbox_path( + level="char", doc_id=document_id + ), + ) + + dsdef.write_to_document_log( + document_id=document_id, + vals={ + DocLogKey.num_word_bboxes: len(word_bboxes), + DocLogKey.num_char_bboxes: len(char_bboxes), + DocLogKey.can_map_chars_to_words: can_map_chars_to_words, + }, + ) + + progress.update(bbox_task, advance=1) diff --git a/docgenie/generation/pipeline_06_extract_layout_element_definitions_and_annotation_gt.py b/docgenie/generation/pipeline_06_extract_layout_element_definitions_and_annotation_gt.py new file mode 100755 index 0000000000000000000000000000000000000000..46fc79923228ababc183bb313ad84258e2b1466b --- /dev/null +++ b/docgenie/generation/pipeline_06_extract_layout_element_definitions_and_annotation_gt.py @@ -0,0 +1,252 @@ +from dataclasses import asdict +import json +from docgenie.generation.models import ( + DocLogKey, + PipelineParameters, + SynDatasetDefinition, + SynDocumentLog, +) +from docgenie.generation.models._bbox import 
LayoutBox +from docgenie.generation.models._consts import DatasetTask +from docgenie.generation.models._file import SyntheticDatasetFileStructure +from docgenie.generation.utils.debug import draw_geos_on_pdf +from docgenie.generation.utils.geos import ( + read_custom_elements_from_geos, + read_layout_elements_from_geos, +) +from docgenie.generation.utils.log import log_pipeline_level +from docgenie.generation.utils.status import get_progress_bar + + +def extract_layout_elements_from_geos( + dsdef: SynDatasetDefinition, doc_id: str, debug: bool +) -> list[dict]: + files = dsdef.get_file_structure() + geo_path = files.geometries_directory / f"{doc_id}.json" + geos_gen = read_layout_elements_from_geos(geo_path=geo_path) + geos: list[dict] = list(geos_gen) + + # Add temporary unique ID to each layout element. + results = [] + for i, geo in enumerate(geos): + r = geo["rect"] + if not geo["classes"]: + result = { + "id": f"le{i}", + "class": None, + "content": None, + "rect": r, + "error": "no-class", + } + else: + classes = geo["classes"].split(" ") + # layout_node_class cannot be None, since the elementes are selected by class in pipeline_04 + layout_node_class = next( + (cls for cls in classes if cls.startswith("LE-")), None + ) + result = { + "id": f"le{i}", + "class": layout_node_class, + "content": layout_node_class, + "rect": r, + "error": None, + } + + if r["width"] == 0 or r["height"] == 0: + result["error"] = "invalid-size" + + results.append(result) + + if debug: + debug_pdf_file = files.debug_pdf_layout_directory / f"{doc_id}.pdf" + draw_geos_on_pdf( + geos=geos, + pdf_in=files.pdf_initial_directory / f"{doc_id}.pdf", + pdf_out=debug_pdf_file, + ) + + return results + + +def process_dla( + document_log: SynDocumentLog, + dsdef: SynDatasetDefinition, + dsfiles: SyntheticDatasetFileStructure, + debug: bool, +): + document_id = document_log.document_id + data = extract_layout_elements_from_geos( + dsdef=dsdef, doc_id=document_id, debug=debug + ) + valid_data = 
def extract_kie_elements_from_geos(
    dsdef: SynDatasetDefinition, doc_id: str
) -> list[dict]:
    """Build one key-information record per custom geometry of a document.

    Each record has the keys ``id``, ``group``, ``key``, ``value``, ``rect``
    and ``error``. ``error`` is ``"missing-value"`` for empty text,
    ``"multiple-labels"`` when more than one valid label matches,
    ``"no-label"`` when none matches, and ``None`` otherwise.

    Args:
        dsdef: Dataset definition; ``valid_labels`` and
            ``valid_secondary_labels`` are read (``None`` is treated as empty).
        doc_id: Document whose geometry JSON is read.

    Returns:
        The list of records, in geometry order.
    """
    files = dsdef.get_file_structure()
    geo_path = files.geometries_directory / f"{doc_id}.json"
    geos: list[dict] = list(read_custom_elements_from_geos(geo_path=geo_path))

    valid_labels = dsdef.valid_labels or []
    valid_secondary_labels = dsdef.valid_secondary_labels or []

    # Add temporary unique ID to each element.
    results = []
    for i, geo in enumerate(geos):
        raw_classes = geo["classes"]
        classes = raw_classes.split(" ") if isinstance(raw_classes, str) else []

        all_kie_labels = [c for c in classes if c in valid_labels]
        # Previously indexed unconditionally, which raised IndexError when no
        # class of the element was a valid label.
        kie_label = all_kie_labels[0] if all_kie_labels else None

        # kie_secondary_label can be None, depending on the task
        all_secondary_labels = [c for c in classes if c in valid_secondary_labels]
        kie_secondary_label = all_secondary_labels[0] if all_secondary_labels else None

        result = {
            "id": f"{i}_{kie_label}_{kie_secondary_label}",
            "group": kie_secondary_label,
            "key": kie_label,
            "value": geo["text"],
            "rect": geo["rect"],
            "error": None,
        }

        # Don't allow empty value
        if not geo["text"] or not geo["text"].strip():
            result["error"] = "missing-value"

        if len(all_kie_labels) > 1:
            result["error"] = "multiple-labels"
        elif kie_label is None:
            result["error"] = "no-label"

        results.append(result)

    return results
def pipeline_extract_layout_element_definitions_and_annotation_gt(
    params: PipelineParameters,
):
    """Extract raw annotations for every single-page document.

    Does nothing unless ``dsdef.prompt_task`` is ``"annotation"``. Depending on
    the dataset task, each document is handed to ``process_dla`` or
    ``process_kie``; other tasks only advance the progress bar. A summary of
    the extracted element and document counts is printed at the end.
    """
    log_pipeline_level()

    dsdef = params.dsdef
    if dsdef.prompt_task != "annotation":
        return

    dsfiles = dsdef.get_file_structure()

    # Single-page documents keyed by id; the counter keeps the number of
    # matching log entries, which is what the progress bar is sized with.
    single_page_logs = {}
    single_page_entries = 0
    for doclog in dsdef.get_document_logs():
        if doclog.pdf_num_pages != 1:
            continue
        single_page_entries += 1
        single_page_logs[doclog.document_id] = doclog

    total_document_count = 0
    total_layout_elements_count = 0
    dataset_task = DatasetTask(dsdef.task)

    with get_progress_bar() as progress:
        annotation_task = progress.add_task(
            description=f"[red]Extracting layout elements from {single_page_entries} PDFs...",
            total=single_page_entries,
        )

        for document_log in single_page_logs.values():
            if dataset_task == DatasetTask.DLA:
                found_annotations = process_dla(
                    document_log=document_log,
                    dsdef=dsdef,
                    dsfiles=dsfiles,
                    debug=params.debug,
                )
            # This step only runs for prompt_task == 'annotation', so KIE
            # raw_annotations modelled as prompt_task == 'json' are untouched.
            elif dataset_task == DatasetTask.KIE:
                found_annotations = process_kie(
                    document_log=document_log, dsdef=dsdef, dsfiles=dsfiles
                )
            else:
                found_annotations = 0

            if found_annotations > 0:
                total_document_count += 1
            total_layout_elements_count += found_annotations

            progress.update(annotation_task, advance=1)

    print(f"Extracted {total_layout_elements_count=} from {total_document_count=}")
def extract_handwritten_fields(
    dsdef: SynDatasetDefinition, doc_id: str, max_word_len: int = -1
) -> list[dict]:
    """Match handwriting geometries of a document to its word bounding boxes.

    For every handwriting geometry a record with the keys ``id``, ``text``,
    ``author-id``, ``bboxes``, ``rect``, ``is_signature`` and ``error`` is
    produced. ``error`` is ``"no-authorid"``, ``"no-text"`` or ``"not-found"``
    when the element cannot be used (``author-id`` and ``bboxes`` are then
    ``None``), and ``None`` on success.

    Args:
        dsdef: Dataset definition used to resolve the file structure.
        doc_id: Document to process.
        max_word_len: When greater than 1, matched words are split via
            ``decompose_word_bboxes`` with this maximum length.

    Returns:
        The list of records, in geometry order.
    """
    paths = dsdef.get_file_structure()
    word_bboxes = read_syn_dataset_bboxes(
        paths.get_pdf_bbox_path(level="word", doc_id=doc_id)
    )
    char_bboxes = read_syn_dataset_bboxes(
        paths.get_pdf_bbox_path(level="char", doc_id=doc_id)
    )

    geo_path = paths.geometries_directory / f"{doc_id}.json"
    geos = list(read_handwriting_elements_from_geos(geo_path=geo_path))

    def build_record(index, geo, is_signature, author_id, bboxes, error):
        # Single place that fixes the record layout (and JSON key order).
        return {
            "id": f"hw{index}",
            "text": geo["text"],
            "author-id": author_id,
            "bboxes": bboxes,
            "rect": geo["rect"],
            "is_signature": is_signature,
            "error": error,
        }

    result = []
    taken_bbox_indices = set()
    for i, geo in enumerate(geos):
        field_text = geo["text"]

        # Non-string `classes` (e.g. None) is treated as "no classes".
        raw_classes = geo["classes"]
        classes = raw_classes.split(" ") if isinstance(raw_classes, str) else []
        author_id = get_author_id(classes)  # type: ignore
        is_signature = SIGNATURE_CLASS_NAME in classes

        if author_id is None:
            result.append(build_record(i, geo, is_signature, None, None, "no-authorid"))
            continue

        if not field_text or field_text.isspace():
            result.append(build_record(i, geo, is_signature, None, None, "no-text"))
            continue

        startidx, stopidx = find_bbox_indices(
            word_bboxes,
            query=field_text,
            taken_indices=taken_bbox_indices,
            rect=geo["rect"],
            verbose=False,
        )
        if startidx is None or stopidx is None:
            result.append(build_record(i, geo, is_signature, None, None, "not-found"))
            continue

        # Only real matches are reserved, so a later element with the same
        # text is matched to the next occurrence.
        taken_bbox_indices.add((startidx, stopidx))
        corresponding_boxes = word_bboxes[startidx:stopidx]

        # Split words to max len for diffusion model
        if max_word_len > 1:
            bboxes = decompose_word_bboxes(
                word_bboxes=corresponding_boxes,
                char_bboxes=char_bboxes,
                N=max_word_len,
                verbose=False,
            )
        else:
            bboxes = corresponding_boxes

        result.append(
            build_record(
                i,
                geo,
                is_signature,
                author_id,
                [b.as_string() for b in bboxes],
                None,
            )
        )

    return result
def pipeline_extract_handwritten_fields(params: PipelineParameters):
    """Extract handwriting fields for every eligible document and log the outcome.

    A document is eligible when its log reports exactly one PDF page, reports
    that characters can be mapped to words, and its char-level bbox file
    exists. For each eligible document the extracted records are written to
    ``<handwritten_bboxes_directory>/<document_id>.json`` (only when at least
    one record exists), and the element count plus the error summaries are
    stored in the document log.
    """
    log_pipeline_level()

    word_len_limit = PIPELINE_06_EXTRACT_HANDWRITING__MAX_WORD_LEN
    dsdef = params.dsdef
    dsfiles = dsdef.get_file_structure()

    # Collect the single-page documents that have char-level bboxes on disk.
    eligible_ids = []
    seen = 0
    for doclog in dsdef.get_document_logs():
        seen += 1
        if doclog.pdf_num_pages != 1 or not doclog.can_map_chars_to_words:
            continue
        char_bbox_file = dsfiles.get_pdf_bbox_path(
            level="char", doc_id=doclog.document_id
        )
        if char_bbox_file.exists():
            eligible_ids.append(doclog.document_id)

    print(
        f"{len(eligible_ids)} out of {seen} PDFs valid for handwritten&signature extraction."
    )

    with get_progress_bar() as progress:
        task_id = progress.add_task(
            f"[red]Extracting Handwriting from {len(eligible_ids)} PDFs...",
            total=len(eligible_ids),
        )

        for document_id in eligible_ids:
            records = extract_handwritten_fields(
                dsdef=dsdef, doc_id=document_id, max_word_len=word_len_limit
            )

            # One human-readable line per failed element.
            failures = []
            for record in records:
                if record["error"] is not None:
                    failures.append(
                        f'{record["id"]}: {record["error"]}, text: "{record["text"]}"'
                    )

            if records:
                target = dsfiles.handwritten_bboxes_directory / f"{document_id}.json"
                target.write_text(json.dumps(records, indent=4), encoding="utf-8")

            dsdef.write_to_document_log(
                document_id=document_id,
                vals={
                    DocLogKey.handwriting_num_elements: len(records),
                    DocLogKey.handwriting_element_extraction_errors: failures,
                },
            )

            progress.update(task_id, advance=1)
index 0000000000000000000000000000000000000000..eeb12b9888c26b6228996b79e7af182dbfb4c722 --- /dev/null +++ b/docgenie/generation/pipeline_08_extract_visual_element_definitions.py @@ -0,0 +1,210 @@ +import json +import pathlib +import re + +from bs4 import BeautifulSoup +import cssutils + +from docgenie.generation.constants import ( + BS_PARSER, + VISUAL_ELEMENT_TYPE_SYNONYMS, + VISUAL_ELEMENT_TYPES, +) +from docgenie.generation.models import ( + DocLogKey, + OCRBox, + PipelineParameters, + SynDatasetDefinition, +) +from rich.progress import ( + Progress, + TimeElapsedColumn, + BarColumn, + TaskProgressColumn, + TimeRemainingColumn, +) + +from docgenie.generation.utils.bboxes import draw_bboxes_on_pdf, read_syn_dataset_bboxes +from docgenie.generation.utils.geos import read_visual_elements_from_geos +from docgenie.generation.utils.log import log_pipeline_level +from docgenie.generation.utils.status import get_progress_bar +from docgenie.generation.utils.visualelement import get_visual_element_id + + +def extract_dimensions(style: str) -> tuple[int | None, int | None]: + """ + Returns width,height in milimeters or None,None + """ + # Parse width and height + width = None + height = None + + for prop in style.split(";"): + if ":" not in prop: + continue + key, value = prop.split(":", 1) + key = key.strip().lower() + value = value.strip() + if key == "width": + width = value + elif key == "height": + height = value + + def normalize(val: str | None): + if val is None: + return None + + if val.endswith("mm"): + return int(val.replace("mm", "")) + elif val.endswith("cm"): + return int(float(val.replace("cm", "")) * 10) + else: + print(f'Encountered an unknown size unit "{val}". (Setting size to `None`)') + return None + + return normalize(width), normalize(height) + + +def parse_2d_rotation(transform_str): + """ + Extracts the 2D rotation angle in degrees from a CSS transform string. + Returns None if no rotation is found. 
def extract_visual_elements_from_geos(
    dsdef: SynDatasetDefinition, doc_id: str
) -> list[dict]:
    """Turn the visual-element geometries of a document into definition dicts.

    Every geometry read from ``<geometries_directory>/<doc_id>.json`` yields a
    dict with the keys ``id``, ``type``, ``type_unmapped``, ``content``,
    ``rect``, ``rotation`` and ``error``. ``type`` is the placeholder type
    itself when it is listed in ``VISUAL_ELEMENT_TYPES``, its entry in
    ``VISUAL_ELEMENT_TYPE_SYNONYMS`` otherwise, or ``None`` when neither
    applies. ``error`` is ``"unknown-type"``, ``"invalid-size"`` (zero width
    or height) or ``None``; an unknown type takes precedence over a bad size.
    """
    geometry_file = dsdef.get_file_structure().geometries_directory / f"{doc_id}.json"
    elements = list(read_visual_elements_from_geos(geo_path=geometry_file))

    definitions = []
    for index, element in enumerate(elements):
        raw_type = element["dataPlaceholder"]

        # Resolve the placeholder type, falling back to the synonym table.
        if raw_type in VISUAL_ELEMENT_TYPES:
            resolved_type = raw_type
        elif raw_type in VISUAL_ELEMENT_TYPE_SYNONYMS:
            resolved_type = VISUAL_ELEMENT_TYPE_SYNONYMS[raw_type]  # type: ignore
        else:
            resolved_type = None

        content = element["dataContent"]
        rotation = extract_rotation_from_transform(element["style"])  # type: ignore
        box = element["rect"]
        has_empty_side = box["width"] == 0 or box["height"] == 0

        if resolved_type is None:
            problem = "unknown-type"
        elif has_empty_side:
            problem = "invalid-size"
        else:
            problem = None

        definitions.append(
            {
                "id": f"ve{index}",
                "type": resolved_type,
                "type_unmapped": raw_type,
                "content": content,
                "rect": box,
                "rotation": rotation,
                "error": problem,
            }
        )

    return definitions
pipeline_extract_visual_element_definitions(params: PipelineParameters): + log_pipeline_level() + + dsdef = params.dsdef + dsfiles = dsdef.get_file_structure() + + # Get valid PDF paths (single page, not processed yet) + valid_document_ids = [] + total_pdfs_count = 0 + for doclog in dsdef.get_document_logs(): + total_pdfs_count += 1 + if doclog.pdf_num_pages == 1: + valid_document_ids.append(doclog.document_id) + + print( + f"{len(valid_document_ids)} out of {total_pdfs_count} PDFs valid for visual element extraction." + ) + + with get_progress_bar() as progress: + vee_task = progress.add_task( + f"[red]Extracting visual elements from {len(valid_document_ids)} PDFs...", + total=len(valid_document_ids), + ) + + total_visual_elements_count = 0 + for document_id in valid_document_ids: + data = extract_visual_elements_from_geos(dsdef=dsdef, doc_id=document_id) + + errors = [ + f"{d['id']}: {d['error']}" for d in data if d["error"] is not None + ] + + dsdef.write_to_document_log( + document_id=document_id, + vals={ + DocLogKey.visual_elements_num_elements: len(data), + DocLogKey.visual_elements_extraction_errors: errors, + }, + ) + + # data is None if there were no bboxes extracted for visual elements + if data is None or len(data) == 0: # type: ignore + progress.update(vee_task, advance=1) + continue + + total_visual_elements_count += len(data) + + result_path = ( + dsfiles.visual_element_definitions_directory / f"{document_id}.json" + ) + json_str = json.dumps(data, indent=4) + result_path.write_text(json_str, encoding="utf-8") + + progress.update(vee_task, advance=1) + + print(f"{total_visual_elements_count=}") diff --git a/docgenie/generation/pipeline_09_create_handwriting_images.py b/docgenie/generation/pipeline_09_create_handwriting_images.py new file mode 100755 index 0000000000000000000000000000000000000000..e7aad2cae23baa0cccfef613efc62d4063719c67 --- /dev/null +++ b/docgenie/generation/pipeline_09_create_handwriting_images.py @@ -0,0 +1,66 @@ +import json 
def pipeline_create_handwriting_images(params: PipelineParameters):
    """Generate handwriting images for all extracted handwriting bboxes.

    The step is skipped, with a message naming the reason, when
    ``params.generate_handwriting`` is false or when the handwritten-bboxes
    directory is missing or contains no files. Otherwise it runs
    ``generate_handwriting``, optionally blurs the generated sentence images
    in place (``params.blur_handwriting_images``), and copies the writer
    style chosen per file from ``raw_token_map.json`` into each document log.
    """
    log_pipeline_level()

    dsdef = params.dsdef
    dsfiles = dsdef.get_file_structure()

    if not params.generate_handwriting:
        print("Handwriting generation disabled - skipping step.")
        return

    bbox_dir = dsfiles.handwritten_bboxes_directory
    # any() stops at the first file instead of materialising the whole listing;
    # the is_dir() guard avoids a crash when no extraction output exists yet.
    has_handwriting = bbox_dir.is_dir() and any(
        f.is_file() for f in bbox_dir.iterdir()
    )
    if not has_handwriting:
        print("No handwriting bboxes found - skipping step.")
        return

    with get_progress_bar() as progress:
        generate_handwriting(
            input_dir=bbox_dir,
            output_dir=dsfiles.handwritten_text_images_directory,
            run_dir=GENERATION.HANDWRITING_MODEL_CHECKPOINT.parent,
            checkpoint=GENERATION.HANDWRITING_MODEL_CHECKPOINT.name,
            progress=progress,
            word_gap=40,
            segment_gap=0,
            allowed_writers=[str(s) for s in WRITER_STYLES],
            baseline_percentile=50,
            batch_size=params.handwriting_batch_size,
        )

    print(f"{params.blur_handwriting_images=}")
    if params.blur_handwriting_images:
        blur_handwriting(
            input_root=dsfiles.handwritten_text_images_directory / "sentences",
            in_place=True,
            suffix="",
        )

    # Log selected writer styles
    log_path = dsfiles.handwritten_text_images_directory / "raw_token_map.json"
    genlogs = json.loads(log_path.read_text(encoding="utf-8"))
    for file_name, author_styles in genlogs["file_author_styles"].items():
        # Strip only the trailing extension; replace() would also remove a
        # ".json" that happens to occur inside the document id.
        doc_id = file_name.removesuffix(".json")
        dsdef.write_to_document_log(
            document_id=doc_id,
            vals={DocLogKey.handwriting_generation_authorid_to_writerstyle: author_styles},
        )
def _scan_directory(directory, image_type):
    """Return the sorted ``*.png``/``*.jpg``/``*.jpeg`` files directly inside a directory.

    Parameters:
        directory: ``pathlib.Path`` of the prefab directory (not searched
            recursively).
        image_type: Label used only in the error message.

    Returns:
        The matching paths in sorted order. Glob results come back in
        filesystem order, which differs between machines; sorting makes a
        seeded ``random.choice`` over the result reproducible.

    Raises:
        FileNotFoundError: If no matching image exists.
    """
    paths = sorted(
        path
        for pattern in ("*.png", "*.jpg", "*.jpeg")
        for path in directory.glob(pattern)
    )

    if not paths:
        raise FileNotFoundError(f"No {image_type} images found in {directory}")

    return paths
def _prepare_barcode(
    result_path: Path, ved: dict, docid: str, dsfiles: SyntheticDatasetFileStructure
):
    """Render a Code 128 barcode for a visual element and save it as RGBA PNG.

    The element's ``content`` is used as the barcode payload when, after
    stripping whitespace, it consists solely of ASCII digits; otherwise a
    random 12-digit number is encoded instead. ``docid`` and ``dsfiles`` are
    unused here; they are part of the shared ``_prepare_*`` signature.

    Parameters:
        result_path: Destination of the generated image.
        ved: Visual element definition; only ``ved["content"]`` is read.
    """
    content = ved["content"]
    stripped = content.strip() if content else ""
    # str.isdigit() alone also accepts non-ASCII digits (e.g. superscripts or
    # Arabic-Indic numerals); requiring ASCII keeps the payload encodable.
    if stripped.isascii() and stripped.isdigit():
        barcode_content = stripped
    else:
        # Generate random number if content is invalid or empty
        barcode_content = str(
            random.randint(100000000000, 999999999999)
        )  # 12-digit number

    code128 = Code128(barcode_content, writer=writer)

    # Save to buffer first to handle transparency
    buffer = io.BytesIO()
    code128.write(buffer, options={"format": "PNG"})
    buffer.seek(0)

    barcode_image = Image.open(buffer).convert("RGBA")  # Transparent background
    barcode_image.save(result_path)
def prepare_visual_elements(
    defs: list[dict], docid: str, dsfiles: SyntheticDatasetFileStructure
) -> list[dict]:
    """Process every visual element definition of one document.

    The global ``random`` generator is seeded with ``docid`` before the first
    definition is processed. Returns the log dict produced by
    ``process_visual_element_definition`` for each definition, in input order.
    """
    random.seed(docid)
    return [
        process_visual_element_definition(definition, docid=docid, dsfiles=dsfiles)
        for definition in defs
    ]
+ ) + + with get_progress_bar() as progress: + insert_task = progress.add_task( + "[red]Creating visual elements...", total=len(valid_documents) + ) + for docid in valid_documents: + visual_element_def_file = ( + dsfiles.visual_element_definitions_directory / f"{docid}.json" + ) + visual_element_definitions = json.loads( + visual_element_def_file.read_text(encoding="utf-8") + ) + insertion_logs = prepare_visual_elements( + defs=visual_element_definitions, docid=docid, dsfiles=dsfiles + ) + + errors = [ + f"{d['id']}: {d['error']}" + for d in insertion_logs + if d["error"] is not None + ] + dsdef.write_to_document_log( + document_id=docid, + vals={ + DocLogKey.visual_elements_generation_logs: insertion_logs, + DocLogKey.visual_elements_generation_errors: errors, + }, + ) + progress.update(insert_task, advance=1) diff --git a/docgenie/generation/pipeline_11_render_pdf_second_pass.py b/docgenie/generation/pipeline_11_render_pdf_second_pass.py new file mode 100755 index 0000000000000000000000000000000000000000..77576d23afb7d98a479b94b5851950100bdc5a6f --- /dev/null +++ b/docgenie/generation/pipeline_11_render_pdf_second_pass.py @@ -0,0 +1,281 @@ +import asyncio +from playwright.async_api import async_playwright +from concurrent.futures import ThreadPoolExecutor, as_completed +import json +import pathlib +import re +import time +import tempfile +import os + +from PyPDF2 import PdfReader +from bs4 import BeautifulSoup +from rich.progress import Progress + +from docgenie import ENV +from docgenie.generation.constants import ( + BS_PARSER, + HANDWRITING_CLASS_NAME, + PIPELINE_03_RENDER_PDF__CHROMIUM_CONCURRENCY, + PIPELINE_03_RENDER_PDF__MAX_WORKERS, + PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_MAX_RETRIES, + PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_TIMEOUT, +) +from docgenie.generation.models import ( + DocLogKey, + PipelineParameters, + SyntheticDatasetFileStructure, + SynDatasetDefinition, +) +from docgenie.generation.models._log import SynDocumentLog +from 
def safe_count_pages(pdf_path: pathlib.Path):
    """Return the number of pages of the PDF at ``pdf_path``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(pdf_path, "rb") as pdf_stream:
        page_total = len(PdfReader(pdf_stream).pages)
    return page_total
async def process_batch_async(
    doclogs: list[SynDocumentLog],
    dsfiles,
    extract_geos_for_classes,
    chromium_concurrency,
    dsdef,
    progress,
    render_task,
    max_retries,
    timeout_seconds,
):
    """Render all given documents concurrently and record each outcome.

    One ``render_pdf_async`` coroutine is created per document log; all of
    them share a semaphore of size ``chromium_concurrency``. Results are
    handled in completion order: each is written to the document log, the
    progress task is advanced, and failures or multi-page renders are
    printed. Returns the list of result dicts that were handled without an
    unexpected exception.
    """
    limiter = asyncio.Semaphore(chromium_concurrency)

    pending = []
    for doclog in doclogs:
        pending.append(
            render_pdf_async(
                doclog,
                dsfiles,
                extract_geos_for_classes,
                limiter,
                max_retries=max_retries,
                timeout_seconds=timeout_seconds,
            )
        )

    finished = []
    for next_done in asyncio.as_completed(pending):
        try:
            outcome = await next_done
            doc_id = outcome[DocLogKey.document_id]
            dsdef.write_to_document_log(document_id=doc_id, vals=outcome)
            progress.update(render_task, advance=1)
            finished.append(outcome)

            if outcome[DocLogKey.pdf_render_error]:
                print(
                    f"[red]PDF failed for {doc_id}: {outcome[DocLogKey.pdf_render_error]}"
                )
            elif (page_count := outcome[DocLogKey.pdf_num_pages]) and page_count > 1:
                print(
                    f"[yellow]Warning: {doc_id} rendered to {page_count} pages"
                )

        except Exception as e:
            # Keep the batch going; the progress bar still has to advance.
            print(f"[red]Unexpected error: {e}")
            progress.update(render_task, advance=1)

    return finished
+ ) + + with get_progress_bar() as progress: + render_task = progress.add_task("[red]Rendering PDFs Pass 2...", total=total) + + # Run async event loop + results = asyncio.run( + process_batch_async( + valid_doclogs, + dsfiles, + extract_geos_for_classes, + chromium_concurrency, + dsdef, + progress, + render_task, + max_retries=max_retries, + timeout_seconds=timeout_seconds, + ) + ) + + print(f"✅ Finished rendering {len(results)}/{total} PDFs.") + + # Summary stats + successful = sum(1 for r in results if r[DocLogKey.pdf_num_pages] == 1) + multi_page = sum( + 1 + for r in results + if r[DocLogKey.pdf_num_pages] and r[DocLogKey.pdf_num_pages] > 1 + ) + failed = sum(1 for r in results if r[DocLogKey.pdf_render_error]) + + print( + f"📊 Summary: {successful} single-page, {multi_page} multi-page, {failed} failed" + ) + + return results diff --git a/docgenie/generation/pipeline_12_insert_handwriting_images.py b/docgenie/generation/pipeline_12_insert_handwriting_images.py new file mode 100755 index 0000000000000000000000000000000000000000..613e954f0c5e146b51be177862407f1093cb07ee --- /dev/null +++ b/docgenie/generation/pipeline_12_insert_handwriting_images.py @@ -0,0 +1,281 @@ +""" +Handwriting insertion with left-alignment only (no region-aware scaling). 
def resize_to_bbox_highres(img, bbox_width, bbox_height, scale_up=3):
    """Scale an image to fit a bounding box at ``scale_up`` times its resolution.

    The aspect ratio is preserved: the image is scaled by the smaller of the
    width and height ratios so it fits inside the (rounded) box, then
    multiplied by ``scale_up`` so it stays sharp when embedded at box size.
    No padding is added; the result has the size of the scaled image itself.

    Parameters:
        img: Source PIL image.
        bbox_width: Target box width; rounded to the nearest integer.
        bbox_height: Target box height; rounded to the nearest integer.
        scale_up: Resolution multiplier applied on top of the fit scale.

    Returns:
        A new RGBA image, pasted onto a fully transparent canvas.
    """
    bbox_width = round(bbox_width)
    bbox_height = round(bbox_height)

    # Aspect Ratio
    iw, ih = img.size
    scale = min(bbox_width / iw, bbox_height / ih)

    # Clamp to one pixel: a very small box would otherwise truncate to a
    # zero-sized target, which is not a usable image size for resize().
    new_w = max(1, int(iw * scale * scale_up))
    new_h = max(1, int(ih * scale * scale_up))

    img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS).convert("RGBA")
    final_img = Image.new("RGBA", (new_w, new_h), (255, 255, 255, 0))
    final_img.paste(img_resized, (0, 0), mask=img_resized)

    return final_img
def insert_handwriting_images(
    docid: str, dsfiles: SyntheticDatasetFileStructure, scale_up: int, debug: bool
):
    """Insert the generated handwriting images of one document into its PDF.

    For every handwriting entry in ``<handwritten_bboxes_directory>/<docid>.json``
    the bboxes are grouped per block/line, and the image
    ``<hw_id>_block<b>_line<l>.png`` from the document's ``sentences`` folder
    is placed left-aligned at the entry's ``rect["x"]`` plus a random offset.
    The result is saved to ``pdf_with_handwriting_directory``.

    Parameters:
        docid: Document identifier.
        dsfiles: File structure providing all input and output directories.
        scale_up: Resolution multiplier passed to ``resize_to_bbox_highres``.
        debug: When true, the inserted rectangles are drawn into a debug PDF.

    Returns:
        A dict keyed by ``DocLogKey`` with the insertion success flag, whether
        an image folder existed for the document, and the list of image name
        prefixes that could not be found.
    """
    images_path = dsfiles.handwritten_text_images_directory / "sentences" / docid
    images_generated = images_path.exists()

    json_path = dsfiles.handwritten_bboxes_directory / f"{docid}.json"
    handwriting_bboxes = json.loads(json_path.read_text(encoding="utf-8"))
    pdf_path = dsfiles.pdf_without_handwriting_placeholder_directory / f"{docid}.pdf"
    output_path = dsfiles.pdf_with_handwriting_directory / f"{docid}.pdf"

    missing_images = []
    inserted_bboxes = []

    # Vertical slack added above and below the line box for the target rect.
    y_padding = 50

    doc = fitz.open(pdf_path)
    try:
        for entry in handwriting_bboxes:
            hw_id = entry["id"]
            rect = entry["rect"]

            for bbox in group_handwriting_bboxes_by_block_line(entry):
                img_name_prefix = f"{hw_id}_block{bbox.block_no}_line{bbox.line_no}"
                img_path = images_path / f"{img_name_prefix}.png"

                # Missing folder and missing file are reported the same way.
                if not images_generated or not img_path.exists():
                    if img_name_prefix not in missing_images:
                        missing_images.append(img_name_prefix)
                    continue

                bbox_w, bbox_h = bbox.x2 - bbox.x0, bbox.y2 - bbox.y0
                # Close the source file as soon as the resized copy exists.
                with Image.open(img_path) as img:
                    img_resized = resize_to_bbox_highres(
                        img, bbox_w, bbox_h, scale_up=scale_up
                    )

                # Convert to bytes
                img_buffer = BytesIO()
                img_resized.save(img_buffer, format="png")
                img_bytes = img_buffer.getvalue()

                # LEFT-ALIGN at rect.x instead of bbox.x0.
                # The two randint calls stay in this order so a seeded run
                # reproduces the same offsets.
                offset_x = (
                    random.randint(
                        -MAX_HANDWRITING_RAND_X_OFFSET_LEFT,
                        MAX_HANDWRITING_RAND_X_OFFSET_RIGHT,
                    )
                    + FIXED_HANDWRITING_X_OFFSET
                )
                offset_y = random.randint(
                    -MAX_HANDWRITING_RAND_Y_OFFSET_UP, MAX_HANDWRITING_RAND_Y_OFFSET_DOWN
                )
                x0 = rect["x"] + offset_x
                y0 = bbox.y0 + offset_y - y_padding
                x2 = min(x0 + img_resized.size[0] / scale_up, bbox.x2) + offset_x
                y2 = (
                    min(y0 + img_resized.size[1] / scale_up, bbox.y2)
                    + offset_y
                    + 2 * y_padding
                )

                rect_fitz = fitz.Rect(x0, y0, x2, y2)

                assert len(doc) == 1
                page: Page = doc[0]
                page.insert_image(rect_fitz, stream=img_bytes)

                # Store for debug
                inserted_bboxes.append(
                    OCRBox(
                        x0=x0,
                        y0=y0,
                        x2=x2,
                        y2=y2,
                        text=bbox.text,
                        block_no=bbox.block_no,
                        line_no=bbox.line_no,
                        word_no=bbox.word_no,
                    )
                )

        doc.save(output_path)
    finally:
        # Release the PDF handle even when an image or insertion step fails.
        doc.close()

    # Debug
    if debug:
        draw_bboxes_on_pdf(
            output_path,
            dsfiles.debug_pdf_handwriting_directory / f"{docid}.pdf",
            inserted_bboxes,
            color=(1, 0, 0),  # handwriting red
        )

    return {
        DocLogKey.handwriting_insertion_success: images_generated
        and len(missing_images) == 0,
        DocLogKey.handwriting_images_were_generated: images_generated,
        DocLogKey.handwriting_missing_images: missing_images,
    }
total_documents_count += 1 + if doclog.pdf_num_pages == 1: + cnt["pdf_num_pages"] += 1 + + # Copy each PDF to pdf_with_handwriting_directory + src = ( + dsfiles.pdf_without_handwriting_placeholder_directory + / f"{doclog.document_id}.pdf" + ) + dst = dsfiles.pdf_with_handwriting_directory / f"{doclog.document_id}.pdf" + shutil.copy(src, dst) + + if doclog.handwriting_num_elements > 0: + cnt["has_handwriting"] += 1 + if len(doclog.handwriting_element_extraction_errors) == 0: + cnt["no_errors"] += 1 + valid_document_ids.append(doclog.document_id) + else: + print( + doclog.document_id, doclog.handwriting_element_extraction_errors + ) + else: + dsdef.write_to_document_log( + document_id=doclog.document_id, + vals={ + DocLogKey.handwriting_insertion_success: True, + DocLogKey.handwriting_images_were_generated: True, + DocLogKey.handwriting_missing_images: [], + }, + ) + + print( + f"{len(valid_document_ids)} out of {total_documents_count} Documents valid for handwriting image insertion: {cnt}" + ) + + with get_progress_bar() as progress: + insert_task = progress.add_task( + "[red]Inserting text into pdfs...", total=len(valid_document_ids) + ) + + success = 0 + all_logs = [] + for docid in valid_document_ids: + insertion_log = insert_handwriting_images( + docid=docid, dsfiles=dsfiles, scale_up=scale_up, debug=params.debug + ) + + dsdef.write_to_document_log(document_id=docid, vals=insertion_log) + all_logs.append(insertion_log) + + if insertion_log[DocLogKey.handwriting_insertion_success]: + success += 1 + + progress.update(insert_task, advance=1) + + print( + f"""Inserted handwriting images in {success} PDFs + {len(valid_document_ids) - success} errors: + {len([1 for insertlog in all_logs if not insertlog[DocLogKey.handwriting_images_were_generated]])} documents dont have images generated + {sum([len(insertlog[DocLogKey.handwriting_missing_images]) for insertlog in all_logs if insertlog[DocLogKey.handwriting_images_were_generated]])} images missing for documents where 
def resize_to_bbox_highres(img, bbox_width, bbox_height, scale_up=3):
    """Fit ``img`` into a bbox-sized transparent canvas at ``scale_up``x resolution.

    The image is scaled by a single factor (the smaller of the horizontal and
    vertical fit ratios), so its aspect ratio is preserved and it overflows the
    bbox in neither direction. It is then centred on an RGBA canvas of
    ``bbox * scale_up`` pixels.

    Rendering at ``scale_up`` times the display size matters because an image
    embedded at exactly the display size often has too few pixels for crisp
    strokes, especially for small boxes.

    Args:
        img: Source PIL image.
        bbox_width: Target width; rounded to an integer, at least 1.
        bbox_height: Target height; rounded to an integer, at least 1.
        scale_up: Integer oversampling factor.

    Returns:
        A new RGBA image of size ``(bbox_width * scale_up, bbox_height * scale_up)``.
    """
    # A degenerate bbox (rounding to 0) would otherwise yield a zero scale and
    # make Image.resize raise; clamp to the smallest drawable size instead.
    bbox_width = max(1, round(bbox_width))
    bbox_height = max(1, round(bbox_height))

    # One common factor for both axes keeps the aspect ratio unchanged.
    iw, ih = img.size
    scale = min(bbox_width / iw, bbox_height / ih)

    # int() truncation can reach 0 for extreme aspect ratios; keep >= 1 pixel.
    new_w = max(1, int(iw * scale * scale_up))
    new_h = max(1, int(ih * scale * scale_up))

    img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS).convert("RGBA")

    # High-resolution, fully transparent canvas (alpha = 0).
    canvas_w = bbox_width * scale_up
    canvas_h = bbox_height * scale_up
    final_img = Image.new("RGBA", (canvas_w, canvas_h), (255, 255, 255, 0))

    # Paste the resized image centred, using its own alpha channel as mask.
    offset_x = (canvas_w - new_w) // 2
    offset_y = (canvas_h - new_h) // 2
    final_img.paste(img_resized, (offset_x, offset_y), mask=img_resized)

    return final_img
Skipping" + ) + if ve_id not in missing_ves: + missing_ves.append(ve_id) + continue + + # computing bbox as in gitlab ticket + # width_pt = mm_to_px(d["width_mm"]) + # height_pt = mm_to_px(d["height_mm"]) + # off_x, off_y = width_pt / 2.0, height_pt / 2.0 + # b = OCRBox( + # x0=d["center_x"] - off_x, + # x2=d["center_x"] - off_x + width_pt, + # y0=d["center_y"] - off_y, + # y2=d["center_y"] - off_y + height_pt, + # text="", + # block_no=-1, + # line_no=-1, + # word_no=-1, + # ) + rect = d["rect"] + b = rect_to_ocrbox(rect) + bbox_w, bbox_h = b.width, b.height + + img = Image.open(img_path) + img_resized = resize_to_bbox_highres(img, bbox_w, bbox_h, scale_up=__SCALE_UP__) + + img_bytes = BytesIO() + img_resized.save(img_bytes, format="PNG") + img_bytes = img_bytes.getvalue() + + rect = fitz.Rect(b.x0, b.y0, b.x2, b.y2) + assert len(doc) == 1, ( + f"Multipage: {dsfiles.pdf_initial_directory / f'{docid}.pdf'}, {dsfiles.pdf_with_handwriting_directory / f'{docid}.pdf'}" + ) + page: Page = doc[0] # single-page assumption + page.insert_image(rect, stream=img_bytes) # type: ignore + + doc.save(output_pdf_path) + doc.close() + return { + DocLogKey.visual_elements_insertion_success: ve_generated + and len(missing_ves) == 0, + DocLogKey.visual_elements_were_generated: ve_generated, + DocLogKey.visual_elements_missing_images: missing_ves, + } + + +def pipeline_insert_visual_elements(params: PipelineParameters): + log_pipeline_level() + + dsdef = params.dsdef + dsfiles = dsdef.get_file_structure() + + valid_document_ids = [] + total_documents_count = 0 + + for doclog in dsdef.get_document_logs(): + total_documents_count += 1 + + if doclog.pdf_num_pages == 1: + # Already copy each PDF to pdf_final, those which have vis elems inserted are later overridden + src = dsfiles.pdf_with_handwriting_directory / f"{doclog.document_id}.pdf" + dst = dsfiles.final_pdf_directory / f"{doclog.document_id}.pdf" + shutil.copy(src, dst) + + if ( + doclog.visual_elements_num_elements > 0 + and 
len(doclog.visual_elements_extraction_errors) == 0 + ): + valid_document_ids.append(doclog.document_id) + print( + f"{len(valid_document_ids)} of {total_documents_count} documents valid for visual element insertion." + ) + + with get_progress_bar() as progress: + insert_task = progress.add_task( + "[red]Inserting visual elements into pdfs...", total=len(valid_document_ids) + ) + success = 0 + examples = list() + for docid in valid_document_ids: + visual_element_def_file = ( + dsfiles.visual_element_definitions_directory / f"{docid}.json" + ) + visual_element_definitions = json.loads( + visual_element_def_file.read_text(encoding="utf-8") + ) + + insertion_logs = insert_visual_elements( + veds=visual_element_definitions, docid=docid, dsfiles=dsfiles + ) + dsdef.write_to_document_log(document_id=docid, vals=insertion_logs) + if insertion_logs[DocLogKey.visual_elements_insertion_success]: + success += 1 + examples.append( + { + "docid": docid, + "types": sorted( + {v["type"] for v in visual_element_definitions} + ), + } + ) + progress.update(insert_task, advance=1) + + print( + f"""Inserted visual elements in {success} PDFs and {len(valid_document_ids) - success} errors occur. 
def pipeline_render_image(params: PipelineParameters):
    """Render the final PDF of every single-page document to an image.

    A document is rendered when its final PDF exists and no rendered image is
    present yet. The image is written to the image directory as
    ``<document_id>.<IMAGE_RENDER_EXT>``.

    Args:
        params: Pipeline parameters; only ``params.dsdef`` is used.
    """
    log_pipeline_level()

    dsdef = params.dsdef
    dsfiles = dsdef.get_file_structure()

    # Get valid PDF paths (single page, not processed yet)
    valid_document_ids = []
    total_pdfs_count = 0
    for doclog in dsdef.get_document_logs():
        total_pdfs_count += 1
        if doclog.pdf_num_pages != 1:
            continue

        final_pdf_path = dsfiles.final_pdf_directory / f"{doclog.document_id}.pdf"
        # Must use the same extension as the file written below; a hard-coded
        # extension would make the "already rendered" check miss every image
        # whenever IMAGE_RENDER_EXT differs from it.
        img_path = dsfiles.img_directory / f"{doclog.document_id}.{IMAGE_RENDER_EXT}"
        if final_pdf_path.exists() and not img_path.exists():
            valid_document_ids.append(doclog.document_id)

    print(
        f"{len(valid_document_ids)} out of {total_pdfs_count} PDFs valid for image conversion."
    )

    with get_progress_bar() as progress:
        img_task = progress.add_task(
            f"[red]Converting {len(valid_document_ids)} PDFs to images...",
            total=len(valid_document_ids),
        )

        for document_id in valid_document_ids:
            # The source is the PDF in the final_pdf directory.
            pdf_path = dsfiles.final_pdf_directory / f"{document_id}.pdf"
            img = convert_from_path_singlepage(pdf_path)

            img_path = dsfiles.img_directory / f"{document_id}.{IMAGE_RENDER_EXT}"
            img.save(img_path, IMAGE_RENDER_EXT.upper())

            progress.update(img_task, advance=1)
def pipeline_perform_ocr(params: PipelineParameters):
    """Produce final word- and segment-level bounding boxes for every single-page document.

    Documents without valid handwriting or visual elements keep the bboxes that
    were extracted from the PDF (segment boxes are derived from the word
    boxes). All other documents are sent through OCR: a cached result file is
    loaded when present, otherwise the OCR service is called and its result is
    cached. The outcome is recorded in the document log.

    Args:
        params: Pipeline parameters; ``params.dsdef`` and ``params.debug`` are used.
    """
    log_pipeline_level()

    dsdef = params.dsdef
    dsfiles = dsdef.get_file_structure()

    # Split the single-page documents by whether their rendered content
    # differs from the PDF text layer (and therefore needs OCR).
    documents_requiring_ocr = []
    documents_not_requiring_ocr = []
    for doclog in dsdef.get_document_logs():
        has_valid_handwriting = (
            doclog.handwriting_num_elements > 0
            and len(doclog.handwriting_element_extraction_errors) == 0
        )
        has_valid_visual_elements = (
            doclog.visual_elements_num_elements > 0
            and len(doclog.visual_elements_extraction_errors) == 0
        )
        is_valid_document = doclog.pdf_num_pages == 1
        if not is_valid_document:
            continue

        if has_valid_handwriting or has_valid_visual_elements:
            documents_requiring_ocr.append(doclog.document_id)
        else:
            documents_not_requiring_ocr.append(doclog.document_id)

    total_valid_documents = len(documents_requiring_ocr) + len(
        documents_not_requiring_ocr
    )
    print(
        f"{len(documents_requiring_ocr)} out of {total_valid_documents} valid documents require OCR."
    )

    # We collect BBoxes and supply segment-level bounding boxes in the end.

    # First copy BBoxes extracted from PDF for those documents that don't require OCR
    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Copy BBoxes for documents not requiring OCR...",
            total=len(documents_not_requiring_ocr),
        )

        for docid in documents_not_requiring_ocr:
            pdf_bbox_path = dsfiles.get_pdf_bbox_path(level="word", doc_id=docid)
            word_bboxes = read_syn_dataset_bboxes(pdf_bbox_path)
            result_path = dsfiles.get_final_bbox_path(level="word", doc_id=docid)
            save_bboxes(word_bboxes, result_path)

            line_bboxes = _convert_word_level_to_line_level_bboxes(word_bboxes)
            result_path = dsfiles.get_final_bbox_path(level="segment", doc_id=docid)
            save_bboxes(line_bboxes, result_path)

            dsdef.write_to_document_log(
                document_id=docid,
                vals={
                    DocLogKey.ocr_required: False,
                    DocLogKey.ocr_found: True,
                    DocLogKey.ocr_num_bboxes_words: len(word_bboxes),
                    DocLogKey.ocr_num_bboxes_lines: len(line_bboxes),
                },
            )

            progress.update(task, advance=1)

    # Then obtain and parse OCR results for the other documents
    ocr_not_found_count = 0
    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Performing OCR for documents...", total=len(documents_requiring_ocr)
        )

        for docid in documents_requiring_ocr:
            image_file = dsfiles.img_directory / f"{docid}.{IMAGE_RENDER_EXT}"
            ocr_result_file = (
                dsfiles.ocr_results_directory
                / f"{docid}.{IMAGE_RENDER_EXT}.0.MicrosoftOcrService.json"
            )

            # Reset per document: a failure below must never fall through to
            # the result object of the previously processed document.
            ocr_result = None
            ocr_error = None
            try:
                if ocr_result_file.exists():
                    ocr_result = MicrosoftOCR.load_from_file(ocr_result_file)
                else:
                    ocr_result = call_ocr_service_from_file(
                        image_file, client_caching=False
                    )
                    ocr_result.save_to_file(ocr_result_file)
            except Exception as e:
                # Best effort per document: record the error and continue.
                ocr_error = str(e)
                ocr_result = None

            # A result counts as found only when it was loaded (or fetched and
            # cached) without error; an unreadable cache file is "not found".
            ocr_found = ocr_result is not None

            num_bboxes_words = -1
            num_bboxes_lines = -1
            if ocr_found:
                # Word level
                word_bboxes: list[OCRBox] = _convert_ms_ocr_to_ocrbox(
                    ocr=ocr_result, level="word"
                )
                save_bboxes(
                    bboxes=word_bboxes,
                    bbox_path=dsfiles.get_final_bbox_path(level="word", doc_id=docid),
                )
                num_bboxes_words = len(word_bboxes)

                if params.debug:
                    draw_bbox_debug(dsdef=dsdef, docid=docid, bboxes=word_bboxes)

                # Line level, from the same in-memory result (no second parse
                # of the cache file, which was previously unguarded).
                line_bboxes: list[OCRBox] = _convert_ms_ocr_to_ocrbox(
                    ocr=ocr_result, level="lines"
                )
                save_bboxes(
                    bboxes=line_bboxes,
                    bbox_path=dsfiles.get_final_bbox_path(
                        level="segment", doc_id=docid
                    ),
                )
                num_bboxes_lines = len(line_bboxes)
            else:
                ocr_not_found_count += 1

            dsdef.write_to_document_log(
                document_id=docid,
                vals={
                    DocLogKey.ocr_required: True,
                    DocLogKey.ocr_found: ocr_found,
                    DocLogKey.ocr_num_bboxes_words: num_bboxes_words,
                    DocLogKey.ocr_num_bboxes_lines: num_bboxes_lines,
                    DocLogKey.ocr_error: ocr_error,
                },
            )

            progress.update(task, advance=1)

    print(
        f"{ocr_not_found_count} of {len(documents_requiring_ocr)} OCR results documents missing."
    )
+ """ + # Convert PDF points to pixels + x_min_px = bbox.x0 + y_min_px = bbox.y0 + x_max_px = bbox.x2 + y_max_px = bbox.y2 + + # Get image size in pixels + img_w_px = width_px + img_h_px = height_px + + # Normalize bounding box + x_min_norm = x_min_px / img_w_px + y_min_norm = y_min_px / img_h_px + x_max_norm = x_max_px / img_w_px + y_max_norm = y_max_px / img_h_px + + return OCRBox( + x0=x_min_norm, + y0=y_min_norm, + x2=x_max_norm, + y2=y_max_norm, + text=bbox.text, + block_no=bbox.block_no, + line_no=bbox.line_no, + word_no=bbox.word_no, + ) + + +def normalize_and_save_word_and_segment_bboxes(dsdef: SynDatasetDefinition, docid: str): + dsfiles = dsdef.get_file_structure() + + width_px, height_px = get_document_size_for_bbox_unnormalization(docid=docid, dsfiles=dsfiles) + + # word + bbox_word_path = dsfiles.get_final_bbox_path(level="word", doc_id=docid) + bbox_word = read_syn_dataset_bboxes(bbox_word_path) + bbox_word_normalized = [ + normalize_ocrbox(bbox=b, width_px=width_px, height_px=height_px) + for b in bbox_word + ] + bbox_word_normalized_path = dsfiles.get_final_normalized_bbox_path( + level="word", doc_id=docid + ) + save_bboxes(bboxes=bbox_word_normalized, bbox_path=bbox_word_normalized_path) + + # segment + bbox_segment_path = dsfiles.get_final_bbox_path(level="segment", doc_id=docid) + bbox_segment = read_syn_dataset_bboxes(bbox_segment_path) + bbox_segment_normalized = [ + normalize_ocrbox(bbox=b, width_px=width_px, height_px=height_px) + for b in bbox_segment + ] + bbox_segment_normalized_path = dsfiles.get_final_normalized_bbox_path( + level="segment", doc_id=docid + ) + save_bboxes(bboxes=bbox_segment_normalized, bbox_path=bbox_segment_normalized_path) + + +def normalize_layout_bboxes(dsdef: SynDatasetDefinition, docid: str): + dsfiles = dsdef.get_file_structure() + + pdf_path = dsfiles.final_pdf_directory / f"{docid}.pdf" + doc = fitz.open(pdf_path) + page = doc[0] + width_pt, height_pt = page.rect.width, page.rect.height + + 
def pipeline_normalize_bboxes(params: PipelineParameters):
    """Normalize OCR bboxes and layout bboxes of all eligible documents.

    Stage 1 normalizes the word/segment bboxes of single-page documents with an
    OCR result. Stage 2 re-reads the document logs and normalizes the layout
    (DLA ground-truth) bboxes of single-page documents that have layout
    elements.
    """
    log_pipeline_level()

    dsdef = params.dsdef

    # Stage 1: word- and segment-level bboxes
    ocr_ready_ids = [
        log.document_id
        for log in dsdef.get_document_logs()
        if log.pdf_num_pages == 1 and log.ocr_found
    ]
    print(f"Found {len(ocr_ready_ids)} documents valid for BBox normalization.")

    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Normalizing BBoxes...", total=len(ocr_ready_ids)
        )
        for docid in ocr_ready_ids:
            normalize_and_save_word_and_segment_bboxes(dsdef=dsdef, docid=docid)
            progress.update(task, advance=1)

    # Stage 2: the DLA GT consists of layout bboxes, so it is normalized here too
    layout_ready_ids = [
        log.document_id
        for log in dsdef.get_document_logs()
        if log.pdf_num_pages == 1 and log.layout_elements_num_elements > 0
    ]
    print(
        f"Found {len(layout_ready_ids)} documents valid for Layout BBox normalization."
    )

    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Normalizing Layout BBoxes...", total=len(layout_ready_ids)
        )
        for docid in layout_ready_ids:
            normalize_layout_bboxes(dsdef=dsdef, docid=docid)
            progress.update(task, advance=1)
def _find_best_fuzzy_match_span(
    original_text: str,
    pattern: str,
    cutoff: float,
    text_positions: list[tuple[int, int]],
):
    """
    Find the pattern-sized window of the text that is most similar to the pattern.

    Text and pattern are whitespace-normalized and compared case-insensitively.
    The similarity of a window is ``1 - levenshtein_distance / len(pattern)``;
    the first window with the highest similarity wins.

    Args:
        original_text: Text to search in.
        pattern: Text to look for.
        cutoff: Minimum similarity for the match to count as found.
        text_positions: One ``(start, end)`` pair per bbox, in ascending order.
            Presumably character offsets into the searched text — verify
            against the caller whether they refer to the normalized text.

    Returns (best_candidate_text, best_score, found, [bbox_indices])
    """
    clean_text = normalize(original_text)
    clean_text_lower = clean_text.lower()
    clean_pattern = normalize(pattern).lower()
    pat_len = len(clean_pattern)

    best_candidate = ""
    best_score = -1
    best_span = (0, 0)

    # A window never exceeds pat_len characters, so the normalising length is
    # always pat_len; an empty pattern leaves nothing to score.
    if pat_len > 0:
        for i in range(len(clean_text) - pat_len + 1):
            candidate = clean_text_lower[i : i + pat_len]
            dist = Levenshtein.distance(candidate, clean_pattern)
            score = 1 - dist / pat_len
            if score > best_score:
                best_score = score
                best_candidate = clean_text[i : i + pat_len]
                best_span = (i, i + pat_len)
                if dist == 0:
                    # Exact match: only a strictly better score could replace
                    # it, and none exists, so stop scanning.
                    break

    found = best_score >= cutoff

    # Map char span → bbox indices
    bbox_indices = []
    if found:
        span_start, span_end = best_span
        for idx, (start, end) in enumerate(text_positions):
            if end < span_start:
                continue
            if start > span_end:
                # Positions are ascending, so nothing further can overlap.
                break
            bbox_indices.append(idx)

    return best_candidate, best_score, found, bbox_indices
def _verify_dla_valid_labels(
    layout_bboxes: list[LayoutBox], valid_labels: list[str]
) -> bool:
    """
    Tell whether every layout element carries a known label.
    Returns True if each box's label is in ``valid_labels`` (also for an empty
    list of boxes). False as soon as one label is unknown.
    """
    return all(box.label in valid_labels for box in layout_bboxes)
def _dla_load_visual_elements(
    document_id: str, dsfiles: SyntheticDatasetFileStructure
) -> list[LayoutBox]:
    """
    Read the visual element definitions of a document as layout boxes.
    Definitions whose "error" field is set are skipped; a document without a
    definition file yields an empty list. Labels are the lower-cased "type".
    """
    ve_data_path = dsfiles.visual_element_definitions_directory / f"{document_id}.json"
    if not ve_data_path.exists():
        return []

    def _as_layout_box(definition: dict) -> LayoutBox:
        # rect is given as x / y / width / height; LayoutBox wants two corners.
        rect, kind = definition["rect"], definition["type"]
        left, top = rect["x"], rect["y"]
        return LayoutBox(
            x0=left,
            y0=top,
            x2=left + rect["width"],
            y2=top + rect["height"],
            label=kind.lower(),
        )

    definitions = json.loads(ve_data_path.read_text(encoding="utf-8"))
    return [
        _as_layout_box(definition)
        for definition in definitions
        if definition["error"] is None
    ]
def _dla_get_pdf_size_pt(
    document_id: str, dsfiles: SyntheticDatasetFileStructure
) -> tuple[float, float]:
    """
    Return ``(width, height)`` of the first page of the document's final PDF,
    taken from the page rectangle.

    The PDF is closed again before returning, also when reading the page fails
    (e.g. for a document without pages).
    """
    pdf_path = dsfiles.final_pdf_directory / f"{document_id}.pdf"
    doc = fitz.open(pdf_path)
    try:
        page_rect = doc[0].rect
        return page_rect.width, page_rect.height
    finally:
        doc.close()
+ # has_containment_or_overlap = False + # if all_labels_known: + # has_containment_or_overlap = _verify_dla_has_containment_or_overlap( + # layout_bboxes, overlap_threshold=0.5 + # ) + # # if has_containment_or_overlap: + # # print(f'[ERROR]: Skipping {document_id} due to containment or overlap in the layout elements.') + + gt_verification_passed = all_labels_known + + # Merge layout elements generated by the LLM with visual elements generated by the LLM. + if gt_verification_passed: + visual_elements: list[LayoutBox] = _dla_load_visual_elements( + document_id, dsfiles + ) + layout_bboxes = _dla_merge_visual_elements_into_dla_annotations( + document_id, dsdef, layout_bboxes, visual_elements, overlap_threshold=0.8 + ) + + # Save post-processed GTs as new RAW-GT + gt_path = dsfiles.raw_annotations_directory / f"{document_id}.json" + gt_path.write_text( + json.dumps([asdict(box) for box in layout_bboxes], indent=2), + encoding="utf-8", + ) + + # Save post-processed GTs as new Normalized-GT + pdf_width_pt, pdf_height_pt = _dla_get_pdf_size_pt(document_id, dsfiles) + layout_bboxes_normalized = [ + LayoutBox.normalize_to_pdf( + box, width_pt=pdf_width_pt, height_pt=pdf_height_pt, dpi=PDF_DPI + ) + for box in layout_bboxes + ] + + # Filter out problematic Bboxes + def is_valid_bbox(b: LayoutBox): + def val(v: float): + return v >= 0 and v <= 1 + + w = b.x2 - b.x0 + h = b.y2 - b.y0 + return ( + val(w) + and val(h) + and val(b.x0) + and val(b.x2) + and val(b.y0) + and val(b.y2) + ) + + layout_bboxes_normalized_filtered = [] + problematic_bboxes = [] + for box in layout_bboxes_normalized: + if is_valid_bbox(box): + layout_bboxes_normalized_filtered.append(box) + else: + problematic_bboxes.append(box) + + if len(problematic_bboxes) == 0: + # ... + # print( + # f"Removed {len(problematic_bboxes)} buggy bboxes, remaining: {len(layout_bboxes_normalized_filtered)}!" 
def prepare_and_verify_gt_classification(
    document_id: str,
    raw_annotations: dict,
    dsfiles: SyntheticDatasetFileStructure,
    dsdef: SynDatasetDefinition,
) -> dict:
    """
    Store the classification label of a document and check that it is known.

    The label is the value of the first entry in ``raw_annotations`` (``None``
    when the dict is empty). It is written as ``{"label": ...}`` to
    ``<document_id>.json`` in ``dsfiles.gt_directory`` regardless of validity.
    Verification passes when the label is contained in ``dsdef.valid_labels``;
    otherwise a message is printed. Returns the GT validation log values.
    """
    label = next(iter(raw_annotations.values()), None)

    target = dsfiles.gt_directory / f"{document_id}.json"
    target.write_text(json.dumps({"label": label}, indent=2), encoding="utf-8")

    is_known = label in dsdef.valid_labels
    if not is_known:
        print(f'Not a valid label "{label}", not in {dsdef.valid_labels}')

    return {
        DocLogKey.gt_verification_confirmed_keys: [],
        DocLogKey.gt_verification_similarities: [],
        DocLogKey.gt_verification_passed: is_known,
        DocLogKey.gt_verification_skipped: False,
    }
list[OCRBox], text_positions +): + verbatim_gts = dict() + similarities = dict() + keys_with_values_found = list() + bbox_indices_per_key = dict() + + # Build document text and map each word's char span + document_text = "" + text_positions = [] + pos = 0 + for b in bboxes: + start = pos + document_text += b.text + " " + end = len(document_text) - 1 + text_positions.append((start, end)) + pos = len(document_text) + + for k, v in gts.items(): + if isinstance(v, dict): + for qa_key, qa_value in v.items(): + best_text, similarity, found, bbox_indices = ( + _find_best_fuzzy_match_span( + document_text, + qa_value, + cutoff=cutoff, + text_positions=text_positions, + ) + ) + + full_key = f"{k}{__KEY_SEPERATOR}{qa_key}" + if found: + keys_with_values_found.append(full_key) + + verbatim_gts[full_key] = best_text.strip() + similarities[full_key] = similarity + bbox_indices_per_key[full_key] = bbox_indices + + else: + best_text, similarity, found, bbox_indices = _find_best_fuzzy_match_span( + document_text, + v, + cutoff=cutoff, + text_positions=text_positions, + ) + + if found: + keys_with_values_found.append(k) + + verbatim_gts[k] = best_text.strip() + similarities[k] = similarity + bbox_indices_per_key[k] = bbox_indices + + return verbatim_gts, keys_with_values_found, similarities, bbox_indices_per_key + + +def prepare_and_verify_gt_qa( + dsdef: SynDatasetDefinition, + dsfiles: SyntheticDatasetFileStructure, + document_id: str, + verbatim_gts: dict, + bbox_indices_per_key: dict, + keys_with_values_found: list, + similarities: dict, +): + gt_data = [] + + for i, q in enumerate(keys_with_values_found): + answer_indices = bbox_indices_per_key[q] + a = verbatim_gts[q] + gt_data.append( + { + "question": q, + "answer": a, + "answer_bbox_indices": answer_indices, + } + ) + + # Save postprocessed GTs + gt_path = dsfiles.gt_directory / f"{document_id}.json" + gt_path.write_text(json.dumps(gt_data, indent=2), encoding="utf-8") + + # Return GT validation log + bbox info + 
def pdf_region_to_image(r, dpi=None):
    """
    Scale a rectangle from 72-per-inch PDF units to image pixels.

    :param r: mapping with numeric ``"x"``, ``"y"``, ``"width"`` and ``"height"``
    :param dpi: resolution of the target image; ``None`` (default) uses ``PDF_DPI``
    :return: new dict with the same four keys, each multiplied by ``dpi / 72``

    The y axis is not flipped: only a uniform scale is applied. The input
    mapping is left untouched and any additional keys are not copied.
    """
    scale = (PDF_DPI if dpi is None else dpi) / 72.0
    return {key: r[key] * scale for key in ("x", "y", "width", "height")}
def _kie_group_from_key(key: str, label: str) -> str:
    """Return the group part of a ``<label>_<group>`` key, or ``""`` when the key has no group."""
    prefix = f"{label}_"
    # Strip only the leading "<label>_"; a plain replace would also remove
    # later occurrences and would return the label itself for ungrouped keys.
    return key[len(prefix):] if key.startswith(prefix) else ""


def prepare_and_verify_gt_kie(
    dsdef: SynDatasetDefinition,
    dsfiles: SyntheticDatasetFileStructure,
    document_id: str,
    verbatim_gts: dict,
    key_to_label: dict,
    word_bboxes: list[OCRBox],
    bbox_indices_per_key: dict,
    keys_with_values_found: list,
    similarities: dict,
):
    """
    Write the KIE ground truth (entities and BIO word labels) of one document.

    For every key in ``keys_with_values_found`` an entity is recorded with its
    (optionally mapped) label, verbatim value, group, word-box indices and
    similarity. The first word index of an entity is tagged ``B-<label>``, the
    following ones ``I-<label>``; all remaining words are tagged ``O``. BIO
    tags use the unmapped label. The result is saved as ``<document_id>.json``
    in ``dsfiles.gt_directory``.

    :param dsdef: dataset definition; a non-empty ``label_mapping`` is applied
        to entity labels and must then contain every label
    :param key_to_label: maps each (possibly group-suffixed) key to its label
    :param word_bboxes: word boxes of the document; only their count is used
    :return: GT validation log values; verification passes when at least one
        key was found
    """
    has_label_mapping = (
        dsdef.label_mapping is not None and len(dsdef.label_mapping) > 0
    )

    entities = []
    # BIO tagging: first collect all B- and I- tags by word index
    non_o = dict()
    for key in keys_with_values_found:
        lbl = key_to_label[key]
        answer_indices = bbox_indices_per_key[key]
        label_mapped = dsdef.label_mapping[lbl] if has_label_mapping else lbl

        entities.append(
            {
                "key": label_mapped,
                "value": verbatim_gts[key],
                # grouping is only relevant for entity linking tasks;
                # '' (not None) when no group was given
                "group": _kie_group_from_key(key, lbl),
                "bbox_indices": answer_indices,
                "similarity": similarities[key],
            }
        )

        for i, bidx in enumerate(answer_indices):
            prefix = "B-" if i == 0 else "I-"
            non_o[bidx] = f"{prefix}{lbl}"

    gt_data = {
        "entities": entities,
        # Every word without a B-/I- tag is outside of any entity
        "word_labels": [non_o.get(i, "O") for i in range(len(word_bboxes))],
    }

    # Save postprocessed GTs
    gt_path = dsfiles.gt_directory / f"{document_id}.json"
    gt_path.write_text(json.dumps(gt_data, indent=2), encoding="utf-8")

    # Return GT validation log
    return {
        DocLogKey.gt_verification_confirmed_keys: keys_with_values_found,
        DocLogKey.gt_verification_similarities: similarities,
        DocLogKey.gt_verification_passed: len(keys_with_values_found) > 0,
        DocLogKey.gt_verification_skipped: False,
    }
+ return prepare_and_verify_gt_classification( + document_id=document_id, + raw_annotations=raw_annotations, + dsfiles=dsfiles, + dsdef=dsdef, + ) + + bbox_path = dsfiles.get_final_bbox_path(level="word", doc_id=document_id) + bboxes: list[OCRBox] = read_syn_dataset_bboxes(box_path=bbox_path) + # Rect ist das Problem nicht bbox + #bboxes = [b.unnormalize(width_px=width_px, height_px=height_px) for b in bboxes_normalized] + + # Build document text and map each word's char span + document_text = "" + text_positions = [] + pos = 0 + for b in bboxes: + start = pos + document_text += b.text + " " + end = len(document_text) - 1 + text_positions.append((start, end)) + pos = len(document_text) + + if dataset_task == DatasetTask.QA: + verbatim_gts, keys_with_values_found, similarities, bbox_indices_per_key = ( + _postprocess_qa_gt_search_answer_indices( + gts=raw_annotations, + document_text=document_text, + cutoff=cutoff, + bboxes=bboxes, + text_positions=text_positions, + ) + ) + + return prepare_and_verify_gt_qa( + dsdef=dsdef, + dsfiles=dsfiles, + document_id=document_id, + verbatim_gts=verbatim_gts, + bbox_indices_per_key=bbox_indices_per_key, + keys_with_values_found=keys_with_values_found, + similarities=similarities, + ) + + if dataset_task == DatasetTask.KIE: + # SROIE is modeled as JSON, but CORD and FUNSD as annotation task + is_annotation_task = dsdef.prompt_task == "annotation" + ( + verbatim_gts, + keys_with_values_found, + similarities, + bbox_indices_per_key, + key_to_label, + ) = _postprocess_kie_gt_search_key_indices( + gts=raw_annotations, + document_text=document_text, + cutoff=cutoff, + bboxes=bboxes, + text_positions=text_positions, + doclog=SynDocumentLog(document_id, dsfiles.document_logs_directory), + dsfiles=dsfiles, + is_annotation_task=is_annotation_task, + ) + + return prepare_and_verify_gt_kie( + dsdef=dsdef, + dsfiles=dsfiles, + document_id=document_id, + verbatim_gts=verbatim_gts, + key_to_label=key_to_label, + 
def pipeline_ground_truth_verification(params: PipelineParameters):
    """
    Prepare and verify the ground truth of every eligible document.

    A document is eligible when its log reports exactly one PDF page, OCR
    output was found, and either a raw JSON ground truth was found and is
    valid JSON or a raw annotation ground truth was found. For each eligible
    document ``prepare_and_verify_gt`` is run and the values it returns are
    written to that document's log.
    """
    log_pipeline_level()

    cutoff = PIPELINE_06_GT_VERIFICATION__GT_SIMILARITY_CUTOFF
    dsdef = params.dsdef

    # Collect the documents that qualify for GT preparation
    valid_document_ids = []
    total_annotations_count = 0
    for doclog in dsdef.get_document_logs():
        total_annotations_count += 1
        gt_valid = (doclog.raw_json_gt_found and doclog.raw_json_gt_valid_json) or (
            doclog.raw_annotation_gt_found
        )
        if doclog.pdf_num_pages == 1 and doclog.ocr_found and gt_valid:
            valid_document_ids.append(doclog.document_id)

    print(
        f"{len(valid_document_ids)} out of {total_annotations_count} documents valid for GT preparation and verification."
    )

    with get_progress_bar() as progress:
        verification_task = progress.add_task(
            f"[red]Preparing and verifying {len(valid_document_ids)} document annotations...",
            total=len(valid_document_ids),
        )

        for document_id in valid_document_ids:
            # prepare_and_verify_gt resolves the file structure itself, so it
            # is not fetched again here.
            gt_log = prepare_and_verify_gt(
                dsdef=dsdef, document_id=document_id, cutoff=cutoff, params=params
            )
            dsdef.write_to_document_log(
                document_id=document_id,
                vals=gt_log,
            )
            progress.update(verification_task, advance=1)
== "errored": + errored_messages_count += 1 + elif msg_result[PromptMsgResultLogKey.result_type] == "succeeded": + succeeded_messages_count += 1 + + # STEP 2 - Message Processing Logs + num_documents_expected = 0 + num_documents_found = 0 + for f in dsfiles.message_processing_logs_directory.iterdir(): + msg_processing_log = json.loads(f.read_text(encoding="utf-8")) + if msg_processing_log[MessageProcessingLogKey.result_type] == "succeeded": + num_documents_expected += msg_processing_log[ + MessageProcessingLogKey.num_documents_expected + ] + num_documents_found += msg_processing_log[ + MessageProcessingLogKey.num_documents_found + ] + + prompting_log = { + "batches_count": batches_count, + "messages_count": messages_count, + "total_usage_input_tokens": total_usage_input_tokens, + "total_usage_output_tokens": total_usage_output_tokens, + "succeeded_messages_count": succeeded_messages_count, + "errored_messages_count": errored_messages_count, + "num_documents_expected": num_documents_expected, + "num_documents_found": num_documents_found, + } + + # STEP 3 - Document Logs + def list_to_entry(items: set): + return {"total": len(items), "items": sorted(items)} + + class DocumentError: + is_multipage = "is_multipage" + invalid_raw_gt = "invalid_raw_gt" + cannot_map_chars_to_words = "cannot_map_chars_to_words" + visual_elements_extraction_error = "visual_elements_extraction_error" + handwriting_extraction_error = "handwriting_extraction_error" + missing_handwriting_images = "missing_handwriting_images" + missing_ocr = "missing_ocr" + gt_verification_failed = "gt_verification_failed" + no_text = "no_text" + + has_no_valid_gt = set() + has_multiple_pages_pass1 = set() + cannot_map_chars_to_words = set() + has_visual_element_extraction_errors = set() + has_handwriting_extraction_errors = set() + has_missing_handwriting = set() + has_missing_ocr = set() + has_failed_gt_verification = set() + has_no_text = set() + + has_handwriting = set() + has_no_handwriting = set() + has_ve 
= set() + has_no_ve = set() + has_handwriting_and_ve = set() + + doc_level_stats_counter = ( + Counter() + ) # handwriting_num_elements visual_elements_num_elements + valid_samples = set() + document_errors = dict() + + # Fetch perfect documents + total_documents = 0 + min_annotation_count = 99999 + max_annotation_count = 0 + for doclog in dsdef.get_document_logs(): + did = doclog.document_id + total_documents += 1 + + gt_valid = (doclog.raw_json_gt_found and doclog.raw_json_gt_valid_json) or ( + doclog.raw_annotation_gt_found + ) + if not gt_valid: + has_no_valid_gt.add(did) + document_errors[did] = DocumentError.invalid_raw_gt + continue + + if not doclog.pdf_num_pages == 1: + has_multiple_pages_pass1.add(did) + document_errors[did] = DocumentError.is_multipage + continue + + if not doclog.can_map_chars_to_words: + cannot_map_chars_to_words.add(did) + document_errors[did] = DocumentError.cannot_map_chars_to_words + continue + + if len(doclog.visual_elements_extraction_errors) != 0: + has_visual_element_extraction_errors.add(did) + document_errors[did] = DocumentError.visual_elements_extraction_error + continue + + if len(doclog.handwriting_element_extraction_errors) != 0: + has_handwriting_extraction_errors.add(did) + document_errors[did] = DocumentError.handwriting_extraction_error + continue + + if len(doclog.handwriting_missing_images) != 0: + has_missing_handwriting.add(did) + document_errors[did] = DocumentError.missing_handwriting_images + continue + + if not doclog.ocr_found: + has_missing_ocr.add(did) + document_errors[did] = DocumentError.missing_ocr + continue + + if not doclog.gt_verification_passed: + has_failed_gt_verification.add(did) + document_errors[did] = DocumentError.gt_verification_failed + continue + + if doclog.num_word_bboxes == 0: + has_no_text.add(did) + document_errors[did] = DocumentError.no_text + continue + + if doclog.handwriting_num_elements > 0: + doc_level_stats_counter[DocLogKey.handwriting_num_elements] += ( + 
doclog.handwriting_num_elements + ) + has_handwriting.add(did) + else: + has_no_handwriting.add(did) + + if doclog.visual_elements_num_elements > 0: + doc_level_stats_counter[DocLogKey.visual_elements_num_elements] += ( + doclog.visual_elements_num_elements + ) + has_ve.add(did) + + if doclog.handwriting_num_elements > 0: + has_handwriting_and_ve.add(did) + else: + has_no_ve.add(did) + + if doclog.num_word_bboxes != -1: + doc_level_stats_counter["num_words"] += doclog.num_word_bboxes + + if doclog.num_char_bboxes != -1: + doc_level_stats_counter["num_chars"] += doclog.num_char_bboxes + + annotations_count = doclog.annotations_count + doc_level_stats_counter["annotations_count"] += annotations_count + min_annotation_count = min(min_annotation_count, annotations_count) + max_annotation_count = max(max_annotation_count, annotations_count) + + valid_samples.add(did) + + # Divide all counts by divisor (keep as float) + normalized = {k: v / len(valid_samples) for k, v in doc_level_stats_counter.items()} + + total_cost_summary = print_cost_report( + batch_data_directory=dsfiles.prompt_batches_directory, + dataset_log_path=dsfiles.ds_log_path, + ) + + dataset_log = { + "prompting": prompting_log, + "total_cost_summary": total_cost_summary, + "valid_samples_stats": { + "total": doc_level_stats_counter, + "avg": normalized, + "min_annotation_count": min_annotation_count, + "max_annotation_count": max_annotation_count, + }, + "total_samples": total_documents, + "valid_samples": list_to_entry(valid_samples), + "valid_samples_by_category": { + "has_handwriting": list_to_entry(has_handwriting), + "has_visual_elements": list_to_entry(has_ve), + "has_handwriting_and_visual_elements": list_to_entry( + has_handwriting_and_ve + ), + "no_handwriting": list_to_entry(has_no_handwriting), + "no_visual_elements": list_to_entry(has_no_ve), + }, + "errors": { + "has_no_valid_gt": list_to_entry(has_no_valid_gt), + "has_multiple_pages_pass1": list_to_entry(has_multiple_pages_pass1), + 
def mm_to_px(mm: int | float):
    """
    Convert a length in millimetres to units of 1/72 inch.

    The value is multiplied by 72 and divided by 25.4 (millimetres per inch).
    """
    units_per_inch = 72
    mm_per_inch = 25.4
    return mm * units_per_inch / mm_per_inch
def draw_bbox_final_debug(dsdef: SynDatasetDefinition, docid: str):
    """
    Render the final segment-level bounding boxes onto the document image.

    The normalized segment boxes of ``docid`` are scaled by the size of the
    rendered document image, drawn (with their text) onto that image, and the
    result is saved under ``debug_pdf_bboxes_final_directory``. Failures while
    drawing or saving are printed and swallowed; failures while reading the
    boxes or opening the image propagate.
    """
    dsfiles = dsdef.get_file_structure()

    bbox_norm_path = dsfiles.get_final_normalized_bbox_path(
        level="segment", doc_id=docid
    )
    bbox_norm = read_syn_dataset_bboxes(bbox_norm_path)

    img_path = dsfiles.img_directory / f"{docid}.{IMAGE_RENDER_EXT}"
    # Context manager closes the underlying file once the debug image is written;
    # this runs once per document, so unclosed handles would pile up.
    with Image.open(img_path) as img:
        width, height = img.size
        bbox_unnorm = list(
            unnormalize_bboxes(bboxes=bbox_norm, width=width, height=height)
        )

        try:
            img_altered = draw_bboxes_on_image(img, bbox_unnorm, show_text=True)
            img_altered.save(
                dsfiles.debug_pdf_bboxes_final_directory
                / f"{docid}.{IMAGE_RENDER_EXT}"
            )
        except Exception as err:
            # Debug output is best effort and must not abort the pipeline
            print(f"[ERROR]: Skipping debug PDF: {str(err)}")
def is_in_rect(rect: dict, bbox: OCRBox, threshold: float, document_id: str | None = None):
    """
    Check whether a bounding box lies inside a rectangle grown by a tolerance.

    ``rect`` provides ``"x"``, ``"y"``, ``"width"`` and ``"height"``; it is
    extended by ``threshold`` on all four sides before ``bbox`` (``x0``,
    ``y0``, ``x2``, ``y2``) is compared against it. Edges are inclusive.
    ``document_id`` is accepted but not used.
    """
    min_x = rect["x"] - threshold
    min_y = rect["y"] - threshold
    # The lower edge already carries one threshold, hence the factor two
    max_x = min_x + rect["width"] + 2 * threshold
    max_y = min_y + rect["height"] + 2 * threshold

    return (
        bbox.x0 >= min_x
        and bbox.y0 >= min_y
        and bbox.x2 <= max_x
        and bbox.y2 <= max_y
    )
def draw_bboxes_on_image(
    image, bboxes: list[OCRBox], color="red", width=3, show_text=True
) -> Image.Image:
    """
    Draw bounding boxes, and optionally their text, on a Pillow image.

    The image is modified in place and returned.

    :param image: Pillow Image object to draw on
    :param bboxes: boxes providing ``x0``, ``y0``, ``x2``, ``y2`` and ``text``
    :param color: Color of the bounding box outline (default: red)
    :param width: Line width (default: 3)
    :param show_text: When True, write each box's text at its top-left corner
    :return: The image that was passed in, with the boxes drawn on it
    """
    draw = ImageDraw.Draw(image)
    # The font does not depend on the box, so it is loaded once up front
    font = ImageFont.load_default(32) if show_text else None

    bbox: OCRBox
    for bbox in bboxes:
        draw.rectangle(
            (bbox.x0, bbox.y0, bbox.x2, bbox.y2), outline=color, width=width
        )
        if show_text:
            # Text is anchored at the top-left corner of the box
            draw.text((bbox.x0, bbox.y0), bbox.text, (255, 0, 255), font=font)

    return image
def get_pdf_size_pt(docid: str, dsfiles: SyntheticDatasetFileStructure):
    """
    Return ``(width, height)`` of the first page of a document's final PDF.

    The PDF is looked up as ``<docid>.pdf`` in ``dsfiles.final_pdf_directory``
    and the values are taken unchanged from the first page's ``rect``. Errors
    raised while opening the file or accessing the first page propagate.
    """
    pdf_path = dsfiles.final_pdf_directory / f"{docid}.pdf"
    doc = fitz.open(pdf_path)
    try:
        first_page_rect = doc[0].rect
        return first_page_rect.width, first_page_rect.height
    finally:
        # Release the file handle even when the page lookup fails
        doc.close()
def read_visual_elements_from_geos(geo_path: Path) -> Iterable[dict]:
    """Yield every geo entry tagged as a visual element.

    The file at ``geo_path`` is parsed as UTF-8 JSON and is expected to hold a
    list of element dictionaries. An entry is yielded when its
    ``"selectorTypes"`` value contains ``"visual_element"``.

    Entries that have no ``"selectorTypes"`` key, or whose value is ``null``,
    are skipped instead of aborting the whole iteration with a
    ``KeyError``/``TypeError``.

    Args:
        geo_path: Path to the geos JSON file.

    Yields:
        The matching element dictionaries, in file order.
    """
    # NOTE(review): the sample entry in this module's docstring uses the key
    # "selectorType" (singular); confirm which spelling the producer emits.
    elements = json.loads(geo_path.read_text(encoding="utf-8"))
    for element in elements:
        selector_types = element.get("selectorTypes") or ()
        if "visual_element" in selector_types:
            yield element
def get_all_author_ids(soup) -> set[str]:
    """Collect the author ids of all handwriting fields found in ``soup``.

    Every element carrying the handwriting class is looked up with
    ``soup.find_all`` and its author id is resolved through
    ``get_author_id_from_field``.

    Args:
        soup: Parsed document exposing ``find_all(class_=...)``.

    Returns:
        The distinct author ids of the handwriting fields.
    """
    # NOTE(review): a handwriting field without any "author*" class resolves
    # to None, so the returned set may contain None despite the annotation.
    return {
        get_author_id_from_field(handwriting_field)
        for handwriting_field in soup.find_all(class_=HANDWRITING_CLASS_NAME)
    }
def downscale_and_compress_from_path(
    old_path: pathlib.Path,
    new_path: pathlib.Path,
    max_width: int = 500,
    quality: int = 80,
):
    """Load an image, shrink it to at most ``max_width`` and save it as JPEG.

    Args:
        old_path: Image file to read.
        new_path: Destination of the JPEG file.
        max_width: Maximum width in pixels; narrower images keep their size
            (see ``downscale_image``).
        quality: JPEG quality passed to ``Image.save``.
    """
    # Modes the JPEG writer accepts directly; anything else (RGBA, LA, P, ...)
    # made ``save`` raise OSError before and is converted to RGB first.
    # NOTE(review): converting RGBA to RGB simply drops the alpha channel.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    # The context manager releases the underlying file handle, which the
    # previous bare ``Image.open`` never closed.
    with Image.open(old_path) as source:
        resized = downscale_image(source, max_width=max_width)
        if resized.mode not in jpeg_modes:
            resized = resized.convert("RGB")
        resized.save(new_path, format="JPEG", quality=quality)
def call_ocr_service_from_image(image: np.ndarray,
                                url: str = OCR_URL,
                                engine: str = OCR_ENGINE,
                                client_caching: bool = True,
                                image_path: Path | None = None) -> MicrosoftOCR:
    """Run OCR on an in-memory image through the HTTP OCR service.

    The image is PNG-encoded and posted to ``{url}/v1/sync/ocr/{engine}``.
    Only the first page of the response is converted into a ``MicrosoftOCR``
    object (angle, image width/height, words and lines).

    Args:
        image: Image array to encode and send.
        url: Base URL of the OCR service.
        engine: Engine name appended to the endpoint path.
        client_caching: When true and ``image_path`` is given, a cached result
            next to the image is returned if present, and a fresh result is
            written there.
        image_path: Location of the image on disk; only used to derive the
            cache file path.

    Returns:
        The OCR result of the first page.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # The cache file lives next to the image, so caching is only possible when
    # a path is known. Previously the default combination
    # (client_caching=True, image_path=None) crashed with AttributeError
    # inside get_ocr_cache_path.
    cache_path = None
    if client_caching and image_path is not None:
        cache_path = get_ocr_cache_path(image_path, OCR_POSTFIX)
        if cache_path.exists():
            return MicrosoftOCR.load_from_file(cache_path)

    headers = {'accept': 'application/json'}
    encoded_image = img_write_to_bytes(image)
    files = {'image': encoded_image, 'type': 'image/png'}
    endpoint = f'{url}/v1/sync/ocr/{engine}'
    # NOTE(review): no timeout is passed, so a hanging OCR service blocks
    # this call indefinitely.
    response = requests.post(url=endpoint, headers=headers, files=files)
    response.raise_for_status()

    data = response.json()
    first_page = data['ocr']['pages'][0]
    ocr = MicrosoftOCR(
        angle=first_page['angle'],
        width=first_page['imageWidth'],
        height=first_page['imageHeight'],
        words=[
            MicrosoftOCRWord(
                text=proto['text'],
                confidence=proto['confidence'],
                geo=proto['geo']
            )
            for proto in first_page['words']
        ],
        # Lines are stored with the same record type as words.
        lines=[
            MicrosoftOCRWord(
                text=proto['text'],
                confidence=proto['confidence'],
                geo=proto['geo']
            )
            for proto in first_page['lines']
        ],
    )

    if cache_path is not None:
        ocr.save_to_file(cache_path)

    return ocr
+ print(ocr.words) diff --git a/docgenie/generation/utils/pdfjs.py b/docgenie/generation/utils/pdfjs.py new file mode 100755 index 0000000000000000000000000000000000000000..f195bc81abddc47ede614cc04a6a83e0f0b5d8a2 --- /dev/null +++ b/docgenie/generation/utils/pdfjs.py @@ -0,0 +1,396 @@ +MEASURE_DIMENSIONS_V1 = """ + () => { + const body = document.body; + const html = document.documentElement; + + // Force layout calculation + body.offsetHeight; + + // Get body's computed style to extract margins + const bodyStyle = window.getComputedStyle(body); + const marginTop = parseFloat(bodyStyle.marginTop) || 0; + const marginBottom = parseFloat(bodyStyle.marginBottom) || 0; + const marginLeft = parseFloat(bodyStyle.marginLeft) || 0; + const marginRight = parseFloat(bodyStyle.marginRight) || 0; + + const bodyRect = body.getBoundingClientRect(); + + // Find the furthest extent of content + let maxY = bodyRect.bottom; + let maxX = bodyRect.right; + + const allElements = body.querySelectorAll('*'); + allElements.forEach(el => { + const rect = el.getBoundingClientRect(); + const style = window.getComputedStyle(el); + + if (style.display === 'none' || + style.visibility === 'hidden' || + rect.width === 0 || rect.height === 0) { + return; + } + + if (rect.bottom > maxY) maxY = rect.bottom; + if (rect.right > maxX) maxX = rect.right; + }); + + // CRITICAL FIX: Include body margins in the total size + // The PDF needs to be tall enough to contain the margins too! 
+ const totalWidth = Math.ceil(maxX - bodyRect.left + marginRight + 5); + const totalHeight = Math.ceil(maxY + marginBottom + 5); + + return { + width: totalWidth, + height: totalHeight, + debug: { + marginTop, + marginBottom, + marginLeft, + marginRight, + bodyRectTop: bodyRect.top, + bodyRectBottom: bodyRect.bottom, + maxY, + contentHeightWithoutMargin: Math.ceil(maxY - bodyRect.top) + } + }; + } + """ + +MEASURE_DIMENSIONS_V2 = """ +() => { + const body = document.body; + const html = document.documentElement; + + // Force layout calculation + body.offsetHeight; + html.offsetHeight; + + // Get body's computed style to extract margins + const bodyStyle = window.getComputedStyle(body); + const marginTop = parseFloat(bodyStyle.marginTop) || 0; + const marginBottom = parseFloat(bodyStyle.marginBottom) || 0; + const marginLeft = parseFloat(bodyStyle.marginLeft) || 0; + const marginRight = parseFloat(bodyStyle.marginRight) || 0; + + // Strategy: Find the bounding box of ALL visible content + // This works for narrow receipts, wide tables, multi-column, everything + + let minX = Infinity; + let minY = Infinity; + let maxX = -Infinity; + let maxY = -Infinity; + + // Check body itself + const bodyRect = body.getBoundingClientRect(); + if (bodyRect.width > 0 && bodyRect.height > 0) { + minX = Math.min(minX, bodyRect.left); + minY = Math.min(minY, bodyRect.top); + maxX = Math.max(maxX, bodyRect.right); + maxY = Math.max(maxY, bodyRect.bottom); + } + + // Check all elements to find true content bounds + const allElements = document.querySelectorAll('*'); + allElements.forEach(el => { + const rect = el.getBoundingClientRect(); + const style = window.getComputedStyle(el); + + // Skip hidden elements + if (style.display === 'none' || + style.visibility === 'hidden' || + rect.width === 0 || + rect.height === 0) { + return; + } + + // Skip script/style tags + if (el.tagName === 'SCRIPT' || el.tagName === 'STYLE') { + return; + } + + minX = Math.min(minX, rect.left); + minY = 
Math.min(minY, rect.top); + maxX = Math.max(maxX, rect.right); + maxY = Math.max(maxY, rect.bottom); + }); + + // Fallback if no content found + if (minX === Infinity) { + minX = 0; + minY = 0; + maxX = bodyRect.right; + maxY = bodyRect.bottom; + } + + // Calculate total dimensions + // Width: from leftmost to rightmost content + right margin + // Height: from topmost to bottommost content + bottom margin + const buffer = 5; // Small safety buffer + + const totalWidth = Math.ceil(maxX - minX + marginRight + buffer); + const totalHeight = Math.ceil(maxY - minY + marginBottom + buffer); + + return { + width: totalWidth, + height: totalHeight, + debug: { + marginTop, + marginBottom, + marginLeft, + marginRight, + minX, + minY, + maxX, + maxY, + bodyWidth: bodyRect.width, + bodyHeight: bodyRect.height + } + }; +} +""" + +MEASURE_DIMENSIONS_V3 = """ +() => { + const body = document.body; + + // Force layout + body.offsetHeight; + + const bodyRect = body.getBoundingClientRect(); + + // For receipts/documents with body padding, the body rect already includes everything + // Just add a small buffer + const buffer = 5; + + return { + width: Math.ceil(bodyRect.width + buffer), + height: Math.ceil(bodyRect.height + buffer) + }; +} +""" + + +MEASURE_DIMENSIONS_V4 = """ +() => { + const body = document.body; + const html = document.documentElement; + + // Force layout calculation + body.offsetHeight; + html.offsetHeight; + + // Get body's computed style to extract margins + const bodyStyle = window.getComputedStyle(body); + const marginTop = parseFloat(bodyStyle.marginTop) || 0; + const marginBottom = parseFloat(bodyStyle.marginBottom) || 0; + const marginLeft = parseFloat(bodyStyle.marginLeft) || 0; + const marginRight = parseFloat(bodyStyle.marginRight) || 0; + + // Get body padding as well + const paddingTop = parseFloat(bodyStyle.paddingTop) || 0; + const paddingBottom = parseFloat(bodyStyle.paddingBottom) || 0; + const paddingLeft = parseFloat(bodyStyle.paddingLeft) || 0; 
+ const paddingRight = parseFloat(bodyStyle.paddingRight) || 0; + + // Strategy: Find the bounding box of ALL visible content + let minX = Infinity; + let minY = Infinity; + let maxX = -Infinity; + let maxY = -Infinity; + + // Check body itself + const bodyRect = body.getBoundingClientRect(); + if (bodyRect.width > 0 && bodyRect.height > 0) { + minX = Math.min(minX, bodyRect.left); + minY = Math.min(minY, bodyRect.top); + maxX = Math.max(maxX, bodyRect.right); + maxY = Math.max(maxY, bodyRect.bottom); + } + + // Check all elements to find true content bounds + const allElements = document.querySelectorAll('*'); + allElements.forEach(el => { + const rect = el.getBoundingClientRect(); + const style = window.getComputedStyle(el); + + // Skip hidden elements + if (style.display === 'none' || + style.visibility === 'hidden' || + rect.width === 0 || + rect.height === 0) { + return; + } + + // Skip script/style tags + if (el.tagName === 'SCRIPT' || el.tagName === 'STYLE') { + return; + } + + minX = Math.min(minX, rect.left); + minY = Math.min(minY, rect.top); + maxX = Math.max(maxX, rect.right); + maxY = Math.max(maxY, rect.bottom); + }); + + // Fallback if no content found + if (minX === Infinity) { + minX = 0; + minY = 0; + maxX = bodyRect.right; + maxY = bodyRect.bottom; + } + + // Calculate total dimensions + // CRITICAL FIX: The viewport starts at 0,0 but content might be offset + // We need the full document size, not just content span + + // For width: take the maximum of either the rightmost content or body width + // For height: take the maximum of either the bottommost content or body height + const buffer = 5; + + // Option A: Measure from viewport origin (0,0) to furthest content + const totalWidth = Math.ceil(maxX + buffer); + const totalHeight = Math.ceil(maxY + buffer); + + // Option B: Also consider body's full width (in case body is wider than content) + const bodyFullWidth = bodyRect.width; + const bodyFullHeight = bodyRect.height; + + // Use whichever 
is larger + const finalWidth = Math.max(totalWidth, bodyFullWidth); + const finalHeight = Math.max(totalHeight, bodyFullHeight); + + return { + width: finalWidth, + height: finalHeight, + debug: { + marginTop, + marginBottom, + marginLeft, + marginRight, + paddingTop, + paddingBottom, + paddingLeft, + paddingRight, + minX, + minY, + maxX, + maxY, + bodyWidth: bodyRect.width, + bodyHeight: bodyRect.height, + bodyLeft: bodyRect.left, + bodyTop: bodyRect.top, + totalWidth, + totalHeight, + bodyFullWidth, + bodyFullHeight + } + }; +} +""" + +MEASURE_DIMENSIONS = """ +() => { + const body = document.body; + const html = document.documentElement; + + // Force layout + body.offsetHeight; + html.offsetHeight; + + const bodyStyle = window.getComputedStyle(body); + const paddingTop = parseFloat(bodyStyle.paddingTop) || 0; + const paddingBottom = parseFloat(bodyStyle.paddingBottom) || 0; + const paddingLeft = parseFloat(bodyStyle.paddingLeft) || 0; + const paddingRight = parseFloat(bodyStyle.paddingRight) || 0; + + // Strategy: Find bounding box of ALL visible content + let minX = Infinity; + let minY = Infinity; + let maxX = -Infinity; + let maxY = -Infinity; + + const bodyRect = body.getBoundingClientRect(); + + // Check all elements (not just body children, in case of deep nesting) + const allElements = document.querySelectorAll('body *'); + let hasContent = false; + + allElements.forEach(el => { + // Skip scripts, styles, and hidden elements + if (el.tagName === 'SCRIPT' || el.tagName === 'STYLE') return; + + const rect = el.getBoundingClientRect(); + const style = window.getComputedStyle(el); + + if (style.display === 'none' || style.visibility === 'hidden' || + rect.width === 0 || rect.height === 0) { + return; + } + + hasContent = true; + minX = Math.min(minX, rect.left); + minY = Math.min(minY, rect.top); + maxX = Math.max(maxX, rect.right); + maxY = Math.max(maxY, rect.bottom); + }); + + // Fallback if no content found + if (!hasContent || minX === Infinity) { + 
return { + width: Math.ceil(bodyRect.width + 5), + height: Math.ceil(bodyRect.height + 5) + }; + } + + // Now decide: do we measure from content bounds or from body bounds? + + // Approach 1: Content-based (for narrow receipts) + // Width = actual content span + left padding + right padding + const contentWidth = maxX - minX; + const contentHeight = maxY - minY; + const contentBasedWidth = contentWidth + paddingLeft + paddingRight; + const contentBasedHeight = contentHeight + paddingTop + paddingBottom; + + // Approach 2: Body-based (for full-width documents) + // Width = body's full width + const bodyBasedWidth = bodyRect.width; + const bodyBasedHeight = bodyRect.height; + + // Decision logic: + // If content is significantly narrower than body (e.g., < 70% of body width), + // it's likely a centered narrow layout like a receipt + // Otherwise, it's a full-width document + + const contentWidthRatio = contentWidth / bodyRect.width; + const isNarrowCentered = contentWidthRatio < 0.7; + + let finalWidth, finalHeight; + + if (isNarrowCentered) { + // Use content-based measurement (receipt-style) + finalWidth = contentBasedWidth; + finalHeight = Math.max(contentBasedHeight, bodyBasedHeight); // Use max for height + } else { + // Use body-based measurement (full-width document) + finalWidth = bodyBasedWidth; + finalHeight = bodyBasedHeight; + } + + const buffer = 5; + + return { + width: Math.ceil(finalWidth + buffer), + height: Math.ceil(finalHeight + buffer), + debug: { + isNarrowCentered, + contentWidthRatio: contentWidthRatio.toFixed(2), + contentWidth, + contentHeight, + bodyWidth: bodyRect.width, + bodyHeight: bodyRect.height, + approach: isNarrowCentered ? 
def _unwrap_optional(field_type):
    """Return ``X`` for ``Optional[X]`` / ``X | None``; other types pass through unchanged."""
    import types
    from typing import Union, get_args, get_origin

    # ``types.UnionType`` (the ``X | None`` spelling) only exists on 3.10+.
    union_origins = (Union, getattr(types, "UnionType", Union))
    if get_origin(field_type) in union_origins:
        members = [arg for arg in get_args(field_type) if arg is not type(None)]
        if len(members) == 1:
            return members[0]
    return field_type


def from_dict(cls, data: dict):
    """
    Recursively parse a dictionary into a dataclass instance.

    Handles nested dataclasses, ``pathlib.Path`` fields, lists of either, and
    the ``Optional[...]`` variants of those field types.

    Args:
        cls: Dataclass type to instantiate; its type hints drive the parsing.
        data: Mapping from field name to raw (JSON-like) value.

    Returns:
        An instance of ``cls``. Fields that are missing from ``data`` or whose
        value is ``None`` are passed to the constructor as ``None``.
    """
    from typing import get_args, get_origin

    kwargs = {}

    for field_name, declared_type in get_type_hints(cls).items():
        value = data.get(field_name)
        if value is None:
            kwargs[field_name] = None
            continue

        # Optional[Path] / Optional[SomeDataclass] used to fall through to the
        # "keep raw value" branch; strip the Optional wrapper first.
        field_type = _unwrap_optional(declared_type)

        if get_origin(field_type) is list:
            # A bare ``typing.List`` has no type arguments; keep raw values.
            type_args = get_args(field_type)
            subtype = type_args[0] if type_args else None
            if is_dataclass(subtype):
                kwargs[field_name] = [from_dict(subtype, item) for item in value]
            elif subtype == pathlib.Path:
                kwargs[field_name] = [pathlib.Path(item) for item in value]
            else:
                kwargs[field_name] = value
        elif is_dataclass(field_type):
            kwargs[field_name] = from_dict(field_type, value)
        elif field_type == pathlib.Path:
            kwargs[field_name] = pathlib.Path(value)
        else:
            kwargs[field_name] = value

    return cls(**kwargs)
def _hex_to_rgb(hex_color: str):
    """Convert a hex color string into an ``(r, g, b)`` tuple of 0-255 ints.

    Accepted forms (leading ``#`` optional): ``RGB``, ``RGBA``, ``RRGGBB`` and
    ``RRGGBBAA``. Short forms are expanded by doubling each digit, and an
    alpha component is ignored so the result always has three channels,
    which is what callers rely on when they append their own alpha value.

    Args:
        hex_color: Color such as ``"#C42828"`` or ``"abc"``.

    Returns:
        A 3-tuple ``(red, green, blue)``.

    Raises:
        ValueError: If the string has an unsupported length or contains
            non-hexadecimal characters.
    """
    digits = hex_color.strip().lstrip("#")
    if len(digits) in (3, 4):
        # "#abc" means "#aabbcc"; slicing one digit per channel gave (10, 11, 12).
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) not in (6, 8):
        raise ValueError(f"Unsupported hex color: {hex_color!r}")
    return tuple(int(digits[i : i + 2], 16) for i in (0, 2, 4))
def _wrap_text_to_fit(text, font, max_width):
    """
    Re-flow ``text`` so that each rendered line fits within ``max_width``.

    Existing line breaks are kept: every input line is wrapped on its own by
    greedily packing whitespace-separated words. A blank input line stays a
    blank line, and a single word wider than ``max_width`` is emitted on a
    line of its own. Returns the wrapped lines joined with newlines.
    """
    # A throw-away 1x1 canvas is enough to measure text extents.
    measurer = ImageDraw.Draw(Image.new("RGBA", (1, 1)))

    def _rendered_width(candidate):
        left, _top, right, _bottom = measurer.textbbox((0, 0), candidate, font=font)
        return right - left

    output = []
    for raw_line in text.split("\n"):
        tokens = raw_line.split()
        if not tokens:
            output.append("")
            continue

        pending = []
        for token in tokens:
            if _rendered_width(" ".join(pending + [token])) <= max_width:
                pending.append(token)
            elif pending:
                # Flush the filled line and start the next one with this word.
                output.append(" ".join(pending))
                pending = [token]
            else:
                # A lone word that is too wide still gets its own line.
                output.append(token)

        if pending:
            output.append(" ".join(pending))

    return "\n".join(output)
+ - Automatically adjusts font size to fit text + - Fixes text cutoff issues + """ + if random_seed is not None: + random.seed(random_seed) + np.random.seed(random_seed) + + w, h = size + scale = max(1, int(supersample)) + W, H = w * scale, h * scale + + if isinstance(color, str): + if "," in color: + color_rgb = tuple(int(x) for x in color.split(",")) + else: + color_rgb = _hex_to_rgb(color) + else: + color_rgb = tuple(color) + + # big canvas (supersampled) + stamp = Image.new("RGBA", (W, H), (0, 0, 0, 0)) + shape_layer = Image.new("RGBA", (W, H), (0, 0, 0, 0)) + d_shape = ImageDraw.Draw(shape_layer) + + min_side = min(W, H) + border_w = max(2 * scale, int(min_side * border_thickness_ratio)) + + jitter_x = random.randint(-int(min_side * 0.005), int(min_side * 0.005)) + jitter_y = random.randint(-int(min_side * 0.005), int(min_side * 0.005)) + + # Draw the ring/rectangle onto shape_layer + if shape.lower() == "circle": + outer = [ + (border_w // 2 + jitter_x, border_w // 2 + jitter_y), + (W - border_w // 2 + jitter_x, H - border_w // 2 + jitter_y), + ] + inner = [ + (border_w * 3 + jitter_x, border_w * 3 + jitter_y), + (W - border_w * 3 + jitter_x, H - border_w * 3 + jitter_y), + ] + for i in range(border_w): + off = random.randint(-scale, scale) + d_shape.ellipse( + [ + (outer[0][0] + i + off, outer[0][1] + i + off), + (outer[1][0] - i + off, outer[1][1] - i + off), + ], + outline=color_rgb + (255,), + ) + d_shape.ellipse(inner, outline=color_rgb + (220,), width=max(1, border_w // 6)) + else: + pad = border_w // 2 + for i in range(border_w): + off = random.randint(-scale, scale) + rect = [ + pad + i + off + jitter_x, + pad + i + off + jitter_y, + W - (pad + i) + jitter_x, + H - (pad + i) + jitter_y, + ] + d_shape.rounded_rectangle( + rect, radius=max(6 * scale, border_w), outline=color_rgb + (255,) + ) + + # Blur the shape layer + bleed_radius = max(1.0 * scale, scale * 0.9) + shape_layer = shape_layer.filter(ImageFilter.GaussianBlur(radius=bleed_radius)) + 
stamp.alpha_composite(shape_layer, (0, 0)) + + # Font loading helper + def _try_load_ttf(desired_size): + try: + if font_path: + return ImageFont.truetype(font_path, desired_size) + else: + return ImageFont.truetype("DejaVuSans-Bold.ttf", desired_size) + except Exception: + return ImageFont.load_default() + + # Calculate available space for inner text + if inner_text: + # Define text area boundaries + if shape.lower() == "circle": + # For circle: use area inside inner ring + text_area_width = W - (border_w * 6) + text_area_height = H - (border_w * 6) + else: + # For rectangle: use area inside borders with padding + text_area_width = W - (border_w * 4) + text_area_height = H - (border_w * 4) + + # Calculate initial font size + if font_size: + inner_font_size = int(font_size * 1.6 * scale) + else: + inner_font_size = int(min_side * 0.20) + + inner_font = _try_load_ttf(inner_font_size) + + # Wrap text to fit width + inner_text = _wrap_text_to_fit(inner_text, inner_font, text_area_width * 0.95) + + # Small font for curved text + if font_size: + small_font_size = max(10 * scale, int(font_size * 0.6 * scale)) + else: + small_font_size = max(10 * scale, int(min_side * 0.055)) + small_font = _try_load_ttf(small_font_size) + + d = ImageDraw.Draw(stamp) + + # Curved text (circle) + if shape.lower() == "circle" and text_top: + center = (W // 2 + jitter_x, H // 2 + jitter_y) + radius = (min_side // 2) - border_w - int(min_side * 0.03) + _draw_text_on_arc( + stamp, + text_top.upper(), + center, + radius, + small_font, + color_rgb + (255,), + start_angle_deg=-90, + ) + if text_bottom: + _draw_text_on_arc( + stamp, + text_bottom.upper(), + center, + radius, + small_font, + color_rgb + (255,), + start_angle_deg=90, + inward=True, + ) + + # Inner/center text - FIXED VERTICAL POSITIONING + if inner_text: + centerx, centery = W // 2 + jitter_x, H // 2 + jitter_y + lines = inner_text.split("\n") + + # Calculate total height and individual line metrics + draw_tmp = 
ImageDraw.Draw(stamp) + line_metrics = [] + total_h = 0 + + for ln in lines: + bbox = draw_tmp.textbbox((0, 0), ln, font=inner_font) + # Use actual bbox for accurate height including descenders + line_height = bbox[3] - bbox[1] + line_metrics.append( + { + "text": ln, + "bbox": bbox, + "width": bbox[2] - bbox[0], + "height": line_height, + "y_offset": -bbox[1], # Offset to account for font baseline + } + ) + total_h += line_height + + # Start from top, centered vertically + y = centery - total_h // 2 + + for metric in line_metrics: + ln = metric["text"] + tw = metric["width"] + th = metric["height"] + y_off = metric["y_offset"] + + # Create image with extra padding to prevent cutoff + padding = 30 + txt_img = Image.new( + "RGBA", (tw + padding * 2, th + padding * 2), (0, 0, 0, 0) + ) + td = ImageDraw.Draw(txt_img) + + # Draw text with proper baseline offset + td.text( + (padding, padding + y_off), ln, font=inner_font, fill=color_rgb + (255,) + ) + + angle = random.uniform(-1.0, 1.0) + txt_img = txt_img.rotate(angle, resample=Image.BICUBIC, expand=True) + + paste_x = int(centerx - txt_img.width / 2) + paste_y = int(y - padding) + + stamp.paste(txt_img, (paste_x, paste_y), txt_img) + y += th + + # Add subtle overlay strokes + overlay = Image.new("RGBA", (W, H), (0, 0, 0, 0)) + od = ImageDraw.Draw(overlay) + if shape.lower() == "circle": + try: + od.ellipse( + [(border_w, border_w), (W - border_w, H - border_w)], + outline=color_rgb + (180,), + width=max(1, border_w // 6), + ) + except Exception: + pass + else: + try: + od.rounded_rectangle( + [border_w, border_w, W - border_w, H - border_w], + radius=max(6 * scale, border_w), + outline=color_rgb + (180,), + width=max(1, border_w // 6), + ) + except Exception: + pass + stamp.alpha_composite(overlay) + + # Add noise texture + noise = _make_noise_image( + (W, H), mean=0.78, std=0.18, blur=2 * scale, contrast=1.05 + ) + noise_biased = _bias_noise_towards_opaque(noise, min_val=210) + + orig_alpha = stamp.split()[-1] + 
def create_stamp_alt(text: str) -> PIL.Image.Image:
    """Build a stamp image around ``text`` in one of two fixed presets.

    A single ``random.random()`` draw picks the preset: values ``<= 0.5``
    produce the red 900x900 circle, anything larger produces the green
    1100x500 rectangle. Both presets forward a hard-coded ``random_seed``
    and ``supersample=3`` to ``create_realistic_stamp``.

    Args:
        text: Passed through as the stamp's ``inner_text``.

    Returns:
        Whatever ``create_realistic_stamp`` returns for the chosen preset.
    """
    if random.random() > 0.5:
        # Rectangle preset: text_top is given by keyword; text_bottom and
        # font_path are left to create_realistic_stamp's own defaults.
        rectangle_preset = {
            "inner_text": text,
            "shape": "rectangle",
            "size": (1100, 500),
            "color": "#1f7a1f",
            "font_size": 56,
            "random_seed": 7,
            "supersample": 3,
        }
        return create_realistic_stamp(text_top="", **rectangle_preset)

    # Circle preset: the empty first argument stays positional, exactly as
    # the call was originally written.
    circle_preset = {
        "text_bottom": "",
        "inner_text": text,
        "shape": "circle",
        "size": (900, 900),
        "color": "#a81f1f",
        "font_path": None,
        "font_size": 60,
        "random_seed": 42,
        "supersample": 3,
    }
    return create_realistic_stamp("", **circle_preset)
class StatusLine:
    """Render a continuously refreshed one-line status message on stdout.

    A daemon thread rebuilds ``[spinner] message | waiting Ns`` every
    ``delay`` seconds. When stdout is a TTY the line is overwritten in
    place (truncated to 79 characters); otherwise a full line is printed
    only when the message changes or a new 30-second window begins, so
    log files are not flooded.
    """

    # Alternative spinner glyph sets; only _BRAILLE is wired in below.
    _BAR = "|/-\\"
    _DOTS = "▖▘▝▗"
    _ARROW = "←↖↑↗→↘↓↙"
    _BRAILLE = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
    _CIRCLE = "◐◓◑◒"
    _DASH = "━╾╼━"

    def __init__(self, delay: float = 0.1):
        """Prepare the worker thread without starting it.

        Args:
            delay: Seconds between two refreshes of the status line.
        """
        # An Event (instead of a plain bool) lets stop() wake the worker
        # immediately rather than waiting out a full sleep interval.
        self._stop_event = threading.Event()
        self._spinner = itertools.cycle(self._BRAILLE)  # rotating chars
        self._delay = delay
        self.message = ""
        # Defined up front so the attribute exists even before start().
        self.start_time = time.time()
        self.lock = threading.Lock()
        self.thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        """Reset the elapsed-time origin and launch the refresh thread."""
        self.start_time = time.time()
        self.thread.start()

    def stop(self):
        """Stop the refresh thread (if it is running) and clear the line.

        Safe to call before ``start()`` and safe to call more than once.
        """
        self._stop_event.set()
        # Thread.join() raises RuntimeError on a thread that was never
        # started, so only join a live worker.
        if self.thread.is_alive():
            self.thread.join()
        # clear line on exit
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

    def update_message(self, msg: str):
        """Replace the text shown next to the spinner."""
        with self.lock:
            self.message = msg

    def log(self, msg: str):
        """Print ``msg`` as a regular line after blanking the status line."""
        sys.stdout.write("\r" + " " * 80 + "\r")  # clear status line
        sys.stdout.write(msg + "\n")
        sys.stdout.flush()

    def _run(self):
        """Worker loop: redraw the status line until stop() is requested."""
        last_logged_message = ""
        last_logged_time = -1
        is_tty = sys.stdout.isatty()

        while not self._stop_event.is_set():
            # Snapshot shared state under the lock; do the I/O outside it.
            with self.lock:
                spinner_char = next(self._spinner)
                now = time.time()
                elapsed = int(now - self.start_time)
                current_msg = self.message
                line = f"[{spinner_char}] {current_msg} | waiting {elapsed}s"

            if is_tty:
                sys.stdout.write("\r" + line[:79])  # overwrite line
                sys.stdout.flush()
            else:
                # Log-file mode: only print when the message changes OR
                # once per 30-second window (int(now) // 30 identifies
                # the window).
                current_period = int(now) // 30
                if (
                    current_msg != last_logged_message
                    or current_period > last_logged_time
                ):
                    sys.stdout.write(f"{line}\n")
                    sys.stdout.flush()
                    last_logged_message = current_msg
                    last_logged_time = current_period

            # Interruptible sleep: returns early as soon as stop() fires.
            self._stop_event.wait(self._delay)
b/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..1678f395589a1c43d3465ab82a9c490e941008c5 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,79 @@ +version: '3.8' + +services: + # Redis for background job queue + redis: + image: redis:7-alpine + ports: + - "6379:6379" + volumes: + - redis_data:/data + command: redis-server --appendonly yes + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 3 + + # DocGenie API Server + api: + build: + context: . + dockerfile: Dockerfile + ports: + - "8000:8000" + environment: + - REDIS_URL=redis://redis:6379 + - HANDWRITING_SERVICE_URL=http://handwriting:8080 + - PORT=8000 + env_file: + - api/.env + depends_on: + - redis + command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload + working_dir: /app/api + volumes: + - ./api:/app/api + - ./docgenie:/app/docgenie + + # Background Worker for async jobs + worker: + build: + context: . + dockerfile: Dockerfile + environment: + - REDIS_URL=redis://redis:6379 + - HANDWRITING_SERVICE_URL=http://handwriting:8080 + env_file: + - api/.env + depends_on: + - redis + command: rq worker --url redis://redis:6379 + working_dir: /app/api + volumes: + - ./api:/app/api + - ./docgenie:/app/docgenie + + # Handwriting Service (GPU) + # Note: Requires nvidia-docker for GPU access + handwriting: + build: + context: handwriting_service + dockerfile: Dockerfile + ports: + - "8080:8080" + environment: + - DEVICE=cuda + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + volumes: + - ./handwriting_service:/app + - ./WordStylist:/app/WordStylist + +volumes: + redis_data: diff --git a/pyproject.toml b/pyproject.toml new file mode 100755 index 0000000000000000000000000000000000000000..25a241b19805af285e1f1c83bd5993d29a25525d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,176 @@ +[project] +name = "docgenie" +version = "0.1.0" +description = "Add your description here" +readme = 
"README.md" +requires-python = "==3.11.12" +dependencies = [ + "aiohappyeyeballs==2.6.1", + "aiohttp==3.12.15", + "aiosignal==1.4.0", + "annotated-types==0.7.0", + "anthropic==0.64.0", + "anyio==4.10.0", + "attrs==25.3.0", + "beautifulsoup4==4.13.4", + "certifi==2025.8.3", + "charset-normalizer==3.4.3", + "click==8.2.1", + "cssutils==2.11.1", + "datasets==4.0.0", + "dill==0.3.8", + "distro==1.9.0", + "einops==0.8.2", + "filelock==3.19.1", + "frozenlist==1.7.0", + "fsspec==2025.3.0", + "gitdb==4.0.12", + "gitpython==3.1.45", + "h11==0.16.0", + "hf-xet==1.1.8", + "httpcore==1.0.9", + "httpx==0.28.1", + "huggingface-hub==0.34.4", + "idna==3.10", + "jinja2==3.1.6", + "jiter==0.10.0", + "jsonlines==4.0.0", + "levenshtein==0.27.1", + "markdown-it-py==4.0.0", + "markupsafe==3.0.2", + "mdurl==0.1.2", + "more-itertools==10.7.0", + "mpmath==1.3.0", + "multidict==6.6.4", + "multiprocess==0.70.16", + "networkx==3.5", + "numpy==1.26.4", + "packaging==25.0", + "pandas==2.3.1", + "pdf2image==1.17.0", + "pillow==11.3.0", + "platformdirs==4.3.8", + "propcache==0.3.2", + "protobuf==6.32.0", + "pyarrow==21.0.0", + "pydantic==2.11.7", + "pydantic-core==2.33.2", + "pygments==2.19.2", + "pymupdf==1.26.3", + "pypdf2==3.0.1", + "python-dateutil==2.9.0.post0", + "pytz==2025.2", + "pyyaml==6.0.2", + "rapidfuzz==3.13.0", + "regex==2025.7.34", + "requests==2.32.5", + "rich==14.1.0", + "safetensors==0.6.2", + "sentry-sdk==2.35.0", + "setuptools==78.1.1", + "six==1.17.0", + "smmap==5.0.2", + "sniffio==1.3.1", + "soupsieve==2.7", + "sympy==1.13.1", + "tokenizers==0.21.4", + "tqdm==4.67.1", + "transformers==4.49", + "triton==2.1.0", + "typing-extensions==4.14.1", + "typing-inspection==0.4.1", + "tzdata==2025.2", + "urllib3==2.5.0", + "wandb==0.21.1", + "wheel==0.45.1", + "xxhash==3.5.0", + "yarl==1.20.1", + "torch==2.1.0", + "torchvision", + "atria-core", + "datadings>=3.4.7", + "pytorch-ignite>=0.5.2", + "scikit-learn>=1.7.2", + "fire>=0.7.1", + "tensorboardx>=2.6.4", + "torchinfo>=1.8.0", + 
"hydra-core>=1.3.2", + "umap-learn==0.5.9.post2", + "hdbscan>=0.8.40", + "h5py>=3.14.0", + "colorlog>=6.9.0", + "dash>=3.2.0", + "flask>=3.1.2", + "plotly>=6.3.1", + "dash-bootstrap-components>=2.0.4", + "matplotlib>=3.10.6", + "pydantic-argparse>=0.10.0", + "sentence-transformers>=5.1.1", + "pydantic-settings>=2.11.0", + "playwright>=1.55.0", + "mmcv==2.1.0", + "mmdet==3.3.0", + "tensorboard>=2.20.0", + "seqeval>=1.2.2", + "textdistance>=4.0.0", + "due-evaluator", + "python-barcode>=0.16.1", + "diffusers>=0.35.2", + "icecream>=2.1.8", + "editdistance>=0.8.1", + "selenium>=4.36.0", + "accelerate>=1.11.0", + "seaborn>=0.13.2", + "timm>=1.0.20", + "fastapi>=0.109.0", + "uvicorn[standard]>=0.27.0", + "python-multipart>=0.0.6", + "lxml>=5.1.0", + "pdfplumber>=0.10.4", + "python-dotenv>=1.0.0", + "tenacity>=8.2.3", + "pytesseract>=0.3.10", + "redis>=5.0.0", + "rq>=1.15.0", + "supabase>=2.0.0", + "google-api-python-client>=2.100.0", + "google-auth-httplib2>=0.2.0", + "google-auth-oauthlib>=1.2.0", +] + +[[tool.uv.index]] +name = "pytorch-cu121" +url = "https://download.pytorch.org/whl/cu121" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cu121"}, +] +torchvision = [ + { index = "pytorch-cu121"}, +] +mmcv = { url = "https://download.openmmlab.com/mmcv/dist/cu121/torch2.1.0/mmcv-2.1.0-cp311-cp311-manylinux1_x86_64.whl" } +atria-core = { git = "https://github.com/saifullah3396/atria_core.git", branch = "devel-estella" } +nest-asyncio = [ + { index = "pypi" } +] +due-evaluator = { git = "https://github.com/due-benchmark/evaluator.git" } + +[dependency-groups] +dev = [ + "ipykernel>=6.30.1", + "pydrive2>=1.21.3", + "pytest>=8.4.2", +] + +[tool.uv] +package = true + +[tool.pytest.ini_options] +pythonpath = [".", "docgenie", "tests"] +# Set additional command line options for pytest +# Ref: https://docs.pytest.org/en/stable/reference/reference.html#command-line-flags +addopts = "-rXs --strict-config --strict-markers --tb=short" +xfail_strict = true # 
#!/bin/bash
# Railway Environment Variables Setup Script
# Run this after linking your Railway project: railway link
#
# Secrets are intentionally NOT stored in this file. Export them in your
# shell (or source them from an untracked .env file) before running:
#   ANTHROPIC_API_KEY, REDIS_URL, RUNPOD_API_KEY, SUPABASE_KEY,
#   GOOGLE_CLIENT_SECRET
#
# SECURITY NOTE: earlier revisions of this script embedded live credentials.
# Anything that was ever committed here must be treated as compromised and
# rotated at the respective provider.

# Stop on the first failing command / unset variable so the success banner
# at the end is only printed when every variable was actually set.
set -euo pipefail

# Fail fast, with a clear message, when a required secret is missing.
required_secrets=(
    ANTHROPIC_API_KEY
    REDIS_URL
    RUNPOD_API_KEY
    SUPABASE_KEY
    GOOGLE_CLIENT_SECRET
)
for secret_name in "${required_secrets[@]}"; do
    if [ -z "${!secret_name:-}" ]; then
        echo "❌ Missing required environment variable: ${secret_name}" >&2
        exit 1
    fi
done

echo "🚀 Setting up Railway environment variables for DocGenie API..."
echo ""

# Required Variables
echo "📦 Setting required variables..."
# Secret values come from the caller's environment (validated above).
railway variables set ANTHROPIC_API_KEY="${ANTHROPIC_API_KEY}"
railway variables set REDIS_URL="${REDIS_URL}"
railway variables set HANDWRITING_SERVICE_URL="https://api.runpod.ai/v2/ht9ajgrduitgpr/runsync"
railway variables set HANDWRITING_SERVICE_ENABLED="true"
railway variables set RUNPOD_API_KEY="${RUNPOD_API_KEY}"
railway variables set SUPABASE_URL="https://vbdwvbjbrbjzegtfsads.supabase.co"
railway variables set SUPABASE_KEY="${SUPABASE_KEY}"
railway variables set GOOGLE_CLIENT_ID="129149757519-bm3qg2mtepkpti8ifjuhbgu6bl52khf6.apps.googleusercontent.com"
railway variables set GOOGLE_CLIENT_SECRET="${GOOGLE_CLIENT_SECRET}"

echo ""
echo "⚙️ Setting recommended variables..."
railway variables set OCR_SERVICE_ENABLED="true"
railway variables set OCR_USE_LOCAL="true"
railway variables set OCR_TESSERACT_LANG="eng"
railway variables set OCR_TESSERACT_CONFIG="--psm 3"
railway variables set OCR_DPI="300"
railway variables set RQ_QUEUE_NAME="docgenie"
railway variables set BATCH_POLL_INTERVAL="30"
railway variables set HANDWRITING_SERVICE_TIMEOUT="300"
railway variables set HANDWRITING_SERVICE_MAX_RETRIES="1"
railway variables set CLAUDE_MODEL="claude-sonnet-4-5-20250929"
railway variables set GOOGLE_DRIVE_FOLDER_NAME="DocGenie Documents"
railway variables set LOG_LEVEL="INFO"
railway variables set DEBUG_MODE="false"

echo ""
echo "✅ All environment variables set successfully!"
echo ""
echo "Next steps:"
echo "1. Verify variables: railway variables"
echo "2. Deploy: railway up"
echo "3. Monitor: railway logs"
echo ""
+echo " PID: $WORKER_PID" + +echo "============================================================" +echo "✓ Both processes started successfully" +echo " API: http://0.0.0.0:${PORT:-7860}" +echo " Worker: Connected to Redis" +echo "============================================================" + +# Function to handle shutdown +shutdown() { + echo "" + echo "🛑 Shutting down gracefully..." + kill -TERM $API_PID 2>/dev/null || true + kill -TERM $WORKER_PID 2>/dev/null || true + wait $API_PID 2>/dev/null || true + wait $WORKER_PID 2>/dev/null || true + echo "✓ Shutdown complete" + exit 0 +} + +# Trap SIGTERM and SIGINT +trap shutdown SIGTERM SIGINT + +# Wait for both processes - if either exits, shutdown both +while kill -0 $API_PID 2>/dev/null && kill -0 $WORKER_PID 2>/dev/null; do + sleep 1 +done + +# If we get here, one process died +echo "" +echo "❌ One of the processes exited unexpectedly" +if ! kill -0 $API_PID 2>/dev/null; then + echo " API process died (PID: $API_PID)" +fi +if ! kill -0 $WORKER_PID 2>/dev/null; then + echo " Worker process died (PID: $WORKER_PID)" +fi + +shutdown diff --git a/uv.lock b/uv.lock new file mode 100755 index 0000000000000000000000000000000000000000..4d4b7aec4c192c1d2f138dba994241d2ec801223 --- /dev/null +++ b/uv.lock @@ -0,0 +1,4059 @@ +version = 1 +revision = 3 +requires-python = "==3.11.12" +resolution-markers = [ + "platform_machine == 'aarch64' and platform_python_implementation != 'CPython' and sys_platform == 'linux'", + "platform_machine != 'aarch64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'", + "sys_platform == 'darwin'", + "sys_platform != 'darwin' and sys_platform != 'linux'", +] + +[manifest] +overrides = [{ name = "atria-core", git = "https://github.com/saifullah3396/atria_core.git?branch=devel-estella" }] + +[[package]] +name = "absl-py" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/10/2a/c93173ffa1b39c1d0395b7e842bbdc62e556ca9d8d3b5572926f3e4ca752/absl_py-2.3.1.tar.gz", hash = "sha256:a97820526f7fbfd2ec1bce83f3f25e3a14840dac0d8e02a0b71cd75db3f77fc9", size = 116588, upload-time = "2025-07-03T09:31:44.05Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/aa/ba0014cc4659328dc818a28827be78e6d97312ab0cb98105a770924dc11e/absl_py-2.3.1-py3-none-any.whl", hash = "sha256:eeecf07f0c2a93ace0772c92e596ace6d3d3996c042b2128459aaae2a76de11d", size = 135811, upload-time = "2025-07-03T09:31:42.253Z" }, +] + +[[package]] +name = "abydos" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecation" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/53/4d8dfccbbfe6031a2293941d718dfda7cf2e39883f915b5e3b2c057b518c/abydos-0.5.0.tar.gz", hash = "sha256:4c02e84e18211ede6885e4347a93e64fe15b777bdce0d69ac5a8617a26baef4f", size = 416089, upload-time = "2020-01-11T00:00:29.052Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/a5/ca258a571997be1c9483d6075bbc1b9487ae80f3bb3bf1f60db0b29f5aa6/abydos-0.5.0-py2.py3-none-any.whl", hash = "sha256:fe758c8f8456a703b7637ab9ac49457c1461d1ee61c97b52a6d803a567f355e1", size = 886001, upload-time = "2020-01-11T00:00:25.853Z" }, +] + +[[package]] +name = "accelerate" +version = "1.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/60/2757c4f03a8705dbf80b1268b03881927878dca5ed07d74f733fb6c219e0/accelerate-1.11.0.tar.gz", hash = "sha256:bb1caf2597b4cd632b917b5000c591d10730bb024a79746f1ee205bba80bd229", size = 393715, upload-time = "2025-10-20T14:42:25.025Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/77/85/85951bc0f9843e2c10baaa1b6657227056095de08f4d1eea7d8b423a6832/accelerate-1.11.0-py3-none-any.whl", hash = "sha256:a628fa6beb069b8e549460fc449135d5bd8d73e7a11fd09f0bc9fc4ace7f06f1", size = 375777, upload-time = "2025-10-20T14:42:23.256Z" }, +] + +[[package]] +name = "addict" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186, upload-time = "2020-11-21T16:21:31.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832, upload-time = "2020-11-21T16:21:29.588Z" }, +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.12.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = 
"yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/e7/d92a237d8802ca88483906c388f7c201bbe96cd80a165ffd0ac2f6a8d59f/aiohttp-3.12.15.tar.gz", hash = "sha256:4fc61385e9c98d72fcdf47e6dd81833f47b2f77c114c29cd64a361be57a763a2", size = 7823716, upload-time = "2025-07-29T05:52:32.215Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/19/9e86722ec8e835959bd97ce8c1efa78cf361fa4531fca372551abcc9cdd6/aiohttp-3.12.15-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d3ce17ce0220383a0f9ea07175eeaa6aa13ae5a41f30bc61d84df17f0e9b1117", size = 711246, upload-time = "2025-07-29T05:50:15.937Z" }, + { url = "https://files.pythonhosted.org/packages/71/f9/0a31fcb1a7d4629ac9d8f01f1cb9242e2f9943f47f5d03215af91c3c1a26/aiohttp-3.12.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:010cc9bbd06db80fe234d9003f67e97a10fe003bfbedb40da7d71c1008eda0fe", size = 483515, upload-time = "2025-07-29T05:50:17.442Z" }, + { url = "https://files.pythonhosted.org/packages/62/6c/94846f576f1d11df0c2e41d3001000527c0fdf63fce7e69b3927a731325d/aiohttp-3.12.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f9d7c55b41ed687b9d7165b17672340187f87a773c98236c987f08c858145a9", size = 471776, upload-time = "2025-07-29T05:50:19.568Z" }, + { url = "https://files.pythonhosted.org/packages/f8/6c/f766d0aaafcee0447fad0328da780d344489c042e25cd58fde566bf40aed/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc4fbc61bb3548d3b482f9ac7ddd0f18c67e4225aaa4e8552b9f1ac7e6bda9e5", size = 1741977, upload-time = "2025-07-29T05:50:21.665Z" }, + { url = "https://files.pythonhosted.org/packages/17/e5/fb779a05ba6ff44d7bc1e9d24c644e876bfff5abe5454f7b854cace1b9cc/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7fbc8a7c410bb3ad5d595bb7118147dfbb6449d862cc1125cf8867cb337e8728", size = 1690645, upload-time = "2025-07-29T05:50:23.333Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/4e/a22e799c2035f5d6a4ad2cf8e7c1d1bd0923192871dd6e367dafb158b14c/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:74dad41b3458dbb0511e760fb355bb0b6689e0630de8a22b1b62a98777136e16", size = 1789437, upload-time = "2025-07-29T05:50:25.007Z" }, + { url = "https://files.pythonhosted.org/packages/28/e5/55a33b991f6433569babb56018b2fb8fb9146424f8b3a0c8ecca80556762/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b6f0af863cf17e6222b1735a756d664159e58855da99cfe965134a3ff63b0b0", size = 1828482, upload-time = "2025-07-29T05:50:26.693Z" }, + { url = "https://files.pythonhosted.org/packages/c6/82/1ddf0ea4f2f3afe79dffed5e8a246737cff6cbe781887a6a170299e33204/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b5b7fe4972d48a4da367043b8e023fb70a04d1490aa7d68800e465d1b97e493b", size = 1730944, upload-time = "2025-07-29T05:50:28.382Z" }, + { url = "https://files.pythonhosted.org/packages/1b/96/784c785674117b4cb3877522a177ba1b5e4db9ce0fd519430b5de76eec90/aiohttp-3.12.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6443cca89553b7a5485331bc9bedb2342b08d073fa10b8c7d1c60579c4a7b9bd", size = 1668020, upload-time = "2025-07-29T05:50:30.032Z" }, + { url = "https://files.pythonhosted.org/packages/12/8a/8b75f203ea7e5c21c0920d84dd24a5c0e971fe1e9b9ebbf29ae7e8e39790/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c5f40ec615e5264f44b4282ee27628cea221fcad52f27405b80abb346d9f3f8", size = 1716292, upload-time = "2025-07-29T05:50:31.983Z" }, + { url = "https://files.pythonhosted.org/packages/47/0b/a1451543475bb6b86a5cfc27861e52b14085ae232896a2654ff1231c0992/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:2abbb216a1d3a2fe86dbd2edce20cdc5e9ad0be6378455b05ec7f77361b3ab50", size = 1711451, upload-time = "2025-07-29T05:50:33.989Z" }, 
+ { url = "https://files.pythonhosted.org/packages/55/fd/793a23a197cc2f0d29188805cfc93aa613407f07e5f9da5cd1366afd9d7c/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:db71ce547012a5420a39c1b744d485cfb823564d01d5d20805977f5ea1345676", size = 1691634, upload-time = "2025-07-29T05:50:35.846Z" }, + { url = "https://files.pythonhosted.org/packages/ca/bf/23a335a6670b5f5dfc6d268328e55a22651b440fca341a64fccf1eada0c6/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ced339d7c9b5030abad5854aa5413a77565e5b6e6248ff927d3e174baf3badf7", size = 1785238, upload-time = "2025-07-29T05:50:37.597Z" }, + { url = "https://files.pythonhosted.org/packages/57/4f/ed60a591839a9d85d40694aba5cef86dde9ee51ce6cca0bb30d6eb1581e7/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:7c7dd29c7b5bda137464dc9bfc738d7ceea46ff70309859ffde8c022e9b08ba7", size = 1805701, upload-time = "2025-07-29T05:50:39.591Z" }, + { url = "https://files.pythonhosted.org/packages/85/e0/444747a9455c5de188c0f4a0173ee701e2e325d4b2550e9af84abb20cdba/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:421da6fd326460517873274875c6c5a18ff225b40da2616083c5a34a7570b685", size = 1718758, upload-time = "2025-07-29T05:50:41.292Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/1006278d1ffd13a698e5dd4bfa01e5878f6bddefc296c8b62649753ff249/aiohttp-3.12.15-cp311-cp311-win32.whl", hash = "sha256:4420cf9d179ec8dfe4be10e7d0fe47d6d606485512ea2265b0d8c5113372771b", size = 428868, upload-time = "2025-07-29T05:50:43.063Z" }, + { url = "https://files.pythonhosted.org/packages/10/97/ad2b18700708452400278039272032170246a1bf8ec5d832772372c71f1a/aiohttp-3.12.15-cp311-cp311-win_amd64.whl", hash = "sha256:edd533a07da85baa4b423ee8839e3e91681c7bfa19b04260a469ee94b778bf6d", size = 453273, upload-time = "2025-07-29T05:50:44.613Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"frozenlist" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "annotated-doc" +version = "0.0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = 
"sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anthropic" +version = "0.64.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d8/4f/f2b880cba1a76f3acc7d5eb2ae217632eac1b8cef5ed3027493545c59eba/anthropic-0.64.0.tar.gz", hash = "sha256:3d496c91a63dff64f451b3e8e4b238a9640bf87b0c11d0b74ddc372ba5a3fe58", size = 427893, upload-time = "2025-08-13T17:09:49.915Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/b2/2d268bcd5d6441df9dc0ebebc67107657edb8b0150d3fda1a5b81d1bec45/anthropic-0.64.0-py3-none-any.whl", hash = "sha256:6f5f7d913a6a95eb7f8e1bda4e75f76670e8acd8d4cd965e02e2a256b0429dd1", size = 297244, upload-time = "2025-08-13T17:09:47.908Z" }, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.9.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", size = 117034, upload-time = "2021-11-06T17:52:23.524Z" } + +[[package]] +name = "anyio" +version = "4.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f1/b4/636b3b65173d3ce9a38ef5f0522789614e590dab6a8d505340a4efe4c567/anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6", size = 213252, upload-time = "2025-08-04T08:54:26.451Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" }, +] + +[[package]] +name = "appnope" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, +] + +[[package]] +name = "asttokens" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, +] + +[[package]] +name = "atria-core" +version = "0.0.0" +source = { git = "https://github.com/saifullah3396/atria_core.git?branch=devel-estella#c171ab4c154bf3ba74ff3b0515908cc49bb5cdbe" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "codename" }, + { name = "coloredlogs" }, + { name = "imagesize" }, 
+ { name = "lazy-loader" }, + { name = "lxml" }, + { name = "numpy" }, + { name = "omegaconf" }, + { name = "pillow" }, + { name = "pyarrow" }, + { name = "pydantic" }, + { name = "pymupdf" }, + { name = "pypdf2" }, + { name = "rich" }, +] + +[[package]] +name = "attrs" +version = "25.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/1367933a8532ee6ff8d63537de4f1177af4bff9f3e829baf7331f595bb24/attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b", size = 812032, upload-time = "2025-03-13T11:10:22.779Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload-time = "2025-03-13T11:10:21.14Z" }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.13.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067, upload-time = "2025-04-15T17:05:13.836Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" }, +] + +[[package]] +name = "blinker" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = 
"sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, +] + +[[package]] +name = "cachetools" +version = "6.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/7e/b975b5814bd36faf009faebe22c1072a1fa1168db34d285ef0ba071ad78c/cachetools-6.2.1.tar.gz", hash = "sha256:3f391e4bd8f8bf0931169baf7456cc822705f4e2a31f840d218f445b9a854201", size = 31325, upload-time = "2025-10-12T14:55:30.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl", hash = "sha256:09868944b6dde876dfd44e1d47e18484541eaf12f26f29b7af91b26cc892d701", size = 11280, upload-time = "2025-10-12T14:55:28.382Z" }, +] + +[[package]] +name = "certifi" +version = "2025.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/67/960ebe6bf230a96cda2e0abcf73af550ec4f090005363542f0765df162e0/certifi-2025.8.3.tar.gz", hash = "sha256:e564105f78ded564e3ae7c923924435e1daa7463faeab5bb932bc53ffae63407", size = 162386, upload-time = "2025-08-03T03:07:47.08Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, +] + +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = 
"implementation_name != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/4a/3dfd5f7850cbf0d06dc84ba9aa00db766b52ca38d8b86e3a38314d52498c/cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe", size = 184344, upload-time = "2025-09-08T23:22:26.456Z" }, + { url = "https://files.pythonhosted.org/packages/4f/8b/f0e4c441227ba756aafbe78f117485b25bb26b1c059d01f137fa6d14896b/cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c", size = 180560, upload-time = "2025-09-08T23:22:28.197Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" }, + { url = "https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" }, + { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" }, + { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" }, + { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" }, + { url = "https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" }, + { url = "https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/015b25184413d7ab0a410775fdb4a50fca20f5589b5dab1dbbfa3baad8ce/cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5", size = 172076, upload-time = "2025-09-08T23:22:40.95Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/8f/dc5531155e7070361eb1b7e4c1a9d896d0cb21c49f807a6c03fd63fc877e/cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5", size = 182820, upload-time = "2025-09-08T23:22:42.463Z" }, + { url = "https://files.pythonhosted.org/packages/95/5c/1b493356429f9aecfd56bc171285a4c4ac8697f76e9bbbbb105e537853a1/cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d", size = 177635, upload-time = "2025-09-08T23:22:43.623Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/2d/5fd176ceb9b2fc619e63405525573493ca23441330fcdaee6bef9460e924/charset_normalizer-3.4.3.tar.gz", hash = "sha256:6fce4b8500244f6fcb71465d4a4930d132ba9ab8e71a7859e6a5d59851068d14", size = 122371, upload-time = "2025-08-09T07:57:28.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/b5/991245018615474a60965a7c9cd2b4efbaabd16d582a5547c47ee1c7730b/charset_normalizer-3.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b256ee2e749283ef3ddcff51a675ff43798d92d746d1a6e4631bf8c707d22d0b", size = 204483, upload-time = "2025-08-09T07:55:53.12Z" }, + { url = "https://files.pythonhosted.org/packages/c7/2a/ae245c41c06299ec18262825c1569c5d3298fc920e4ddf56ab011b417efd/charset_normalizer-3.4.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:13faeacfe61784e2559e690fc53fa4c5ae97c6fcedb8eb6fb8d0a15b475d2c64", size = 145520, upload-time = "2025-08-09T07:55:54.712Z" }, + { url = "https://files.pythonhosted.org/packages/3a/a4/b3b6c76e7a635748c4421d2b92c7b8f90a432f98bda5082049af37ffc8e3/charset_normalizer-3.4.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:00237675befef519d9af72169d8604a067d92755e84fe76492fef5441db05b91", size = 158876, upload-time = "2025-08-09T07:55:56.024Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e6/63bb0e10f90a8243c5def74b5b105b3bbbfb3e7bb753915fe333fb0c11ea/charset_normalizer-3.4.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:585f3b2a80fbd26b048a0be90c5aae8f06605d3c92615911c3a2b03a8a3b796f", size = 156083, upload-time = "2025-08-09T07:55:57.582Z" }, + { url = "https://files.pythonhosted.org/packages/87/df/b7737ff046c974b183ea9aa111b74185ac8c3a326c6262d413bd5a1b8c69/charset_normalizer-3.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e78314bdc32fa80696f72fa16dc61168fda4d6a0c014e0380f9d02f0e5d8a07", size = 150295, upload-time = "2025-08-09T07:55:59.147Z" }, + { url = "https://files.pythonhosted.org/packages/61/f1/190d9977e0084d3f1dc169acd060d479bbbc71b90bf3e7bf7b9927dec3eb/charset_normalizer-3.4.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:96b2b3d1a83ad55310de8c7b4a2d04d9277d5591f40761274856635acc5fcb30", size = 148379, upload-time = "2025-08-09T07:56:00.364Z" }, + { url = "https://files.pythonhosted.org/packages/4c/92/27dbe365d34c68cfe0ca76f1edd70e8705d82b378cb54ebbaeabc2e3029d/charset_normalizer-3.4.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:939578d9d8fd4299220161fdd76e86c6a251987476f5243e8864a7844476ba14", size = 160018, upload-time = "2025-08-09T07:56:01.678Z" }, + { url = "https://files.pythonhosted.org/packages/99/04/baae2a1ea1893a01635d475b9261c889a18fd48393634b6270827869fa34/charset_normalizer-3.4.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:fd10de089bcdcd1be95a2f73dbe6254798ec1bda9f450d5828c96f93e2536b9c", size = 157430, upload-time = "2025-08-09T07:56:02.87Z" }, + { url = 
"https://files.pythonhosted.org/packages/2f/36/77da9c6a328c54d17b960c89eccacfab8271fdaaa228305330915b88afa9/charset_normalizer-3.4.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1e8ac75d72fa3775e0b7cb7e4629cec13b7514d928d15ef8ea06bca03ef01cae", size = 151600, upload-time = "2025-08-09T07:56:04.089Z" }, + { url = "https://files.pythonhosted.org/packages/64/d4/9eb4ff2c167edbbf08cdd28e19078bf195762e9bd63371689cab5ecd3d0d/charset_normalizer-3.4.3-cp311-cp311-win32.whl", hash = "sha256:6cf8fd4c04756b6b60146d98cd8a77d0cdae0e1ca20329da2ac85eed779b6849", size = 99616, upload-time = "2025-08-09T07:56:05.658Z" }, + { url = "https://files.pythonhosted.org/packages/f4/9c/996a4a028222e7761a96634d1820de8a744ff4327a00ada9c8942033089b/charset_normalizer-3.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:31a9a6f775f9bcd865d88ee350f0ffb0e25936a7f930ca98995c05abf1faf21c", size = 107108, upload-time = "2025-08-09T07:56:07.176Z" }, + { url = "https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" }, +] + +[[package]] +name = "click" +version = "8.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = 
"2025-05-20T23:19:47.796Z" }, +] + +[[package]] +name = "codename" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c8/44/682d39480678b0d3a07c7c7c08e5c102e4807938ed9d126a77e21be00afd/codename-1.1.tar.gz", hash = "sha256:863780fa47521baa19087015d408b19dac4e93c6eaac4b1cc59a7ea728ea15fd", size = 3493, upload-time = "2018-01-30T13:22:36.71Z" } + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "coloredlogs" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "humanfriendly" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" }, +] + +[[package]] +name = "colorlog" +version = "6.10.1" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/61/f083b5ac52e505dfc1c624eafbf8c7589a0d7f32daa398d2e7590efa5fda/colorlog-6.10.1.tar.gz", hash = "sha256:eb4ae5cb65fe7fec7773c2306061a8e63e02efc2c72eba9d27b0fa23c94f1321", size = 17162, upload-time = "2025-10-16T16:14:11.978Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" }, +] + +[[package]] +name = "comm" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/13/7d740c5849255756bc17888787313b61fd38a0a8304fc4f073dfc46122aa/comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971", size = 6319, upload-time = "2025-07-25T14:02:04.452Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, +] + +[[package]] +name = "contourpy" +version = "1.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/2e/c4390a31919d8a78b90e8ecf87cd4b4c4f05a5b48d05ec17db8e5404c6f4/contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1", size = 288773, upload-time = "2025-07-26T12:01:02.277Z" }, + { url = "https://files.pythonhosted.org/packages/0d/44/c4b0b6095fef4dc9c420e041799591e3b63e9619e3044f7f4f6c21c0ab24/contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381", size = 270149, upload-time = "2025-07-26T12:01:04.072Z" }, + { url = "https://files.pythonhosted.org/packages/30/2e/dd4ced42fefac8470661d7cb7e264808425e6c5d56d175291e93890cce09/contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7", size = 329222, upload-time = "2025-07-26T12:01:05.688Z" }, + { url = "https://files.pythonhosted.org/packages/f2/74/cc6ec2548e3d276c71389ea4802a774b7aa3558223b7bade3f25787fafc2/contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1", size = 377234, upload-time = "2025-07-26T12:01:07.054Z" }, + { url = "https://files.pythonhosted.org/packages/03/b3/64ef723029f917410f75c09da54254c5f9ea90ef89b143ccadb09df14c15/contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a", size = 380555, upload-time = "2025-07-26T12:01:08.801Z" }, + { url = "https://files.pythonhosted.org/packages/5f/4b/6157f24ca425b89fe2eb7e7be642375711ab671135be21e6faa100f7448c/contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db", size = 355238, upload-time = "2025-07-26T12:01:10.319Z" }, + { url = "https://files.pythonhosted.org/packages/98/56/f914f0dd678480708a04cfd2206e7c382533249bc5001eb9f58aa693e200/contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620", size = 1326218, upload-time = "2025-07-26T12:01:12.659Z" }, + { url = "https://files.pythonhosted.org/packages/fb/d7/4a972334a0c971acd5172389671113ae82aa7527073980c38d5868ff1161/contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f", size = 1392867, upload-time = "2025-07-26T12:01:15.533Z" }, + { url = "https://files.pythonhosted.org/packages/75/3e/f2cc6cd56dc8cff46b1a56232eabc6feea52720083ea71ab15523daab796/contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff", size = 183677, upload-time = "2025-07-26T12:01:17.088Z" }, + { url = "https://files.pythonhosted.org/packages/98/4b/9bd370b004b5c9d8045c6c33cf65bae018b27aca550a3f657cdc99acdbd8/contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42", size = 225234, upload-time = "2025-07-26T12:01:18.256Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b6/71771e02c2e004450c12b1120a5f488cad2e4d5b590b1af8bad060360fe4/contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470", size = 193123, upload-time = "2025-07-26T12:01:19.848Z" }, + { url = "https://files.pythonhosted.org/packages/a5/29/8dcfe16f0107943fa92388c23f6e05cff0ba58058c4c95b00280d4c75a14/contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497", size = 278809, upload-time = "2025-07-26T12:02:52.74Z" }, + { url = "https://files.pythonhosted.org/packages/85/a9/8b37ef4f7dafeb335daee3c8254645ef5725be4d9c6aa70b50ec46ef2f7e/contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8", size = 261593, upload-time = "2025-07-26T12:02:54.037Z" }, + { url 
= "https://files.pythonhosted.org/packages/0a/59/ebfb8c677c75605cc27f7122c90313fd2f375ff3c8d19a1694bda74aaa63/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e", size = 302202, upload-time = "2025-07-26T12:02:55.947Z" }, + { url = "https://files.pythonhosted.org/packages/3c/37/21972a15834d90bfbfb009b9d004779bd5a07a0ec0234e5ba8f64d5736f4/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989", size = 329207, upload-time = "2025-07-26T12:02:57.468Z" }, + { url = "https://files.pythonhosted.org/packages/0c/58/bd257695f39d05594ca4ad60df5bcb7e32247f9951fd09a9b8edb82d1daa/contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77", size = 225315, upload-time = "2025-07-26T12:02:58.801Z" }, +] + +[[package]] +name = "croniter" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, + { name = "pytz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/2f/44d1ae153a0e27be56be43465e5cb39b9650c781e001e7864389deb25090/croniter-6.0.0.tar.gz", hash = "sha256:37c504b313956114a983ece2c2b07790b1f1094fe9d81cc94739214748255577", size = 64481, upload-time = "2024-12-17T17:17:47.32Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/4b/290b4c3efd6417a8b0c284896de19b1d5855e6dbdb97d2a35e68fa42de85/croniter-6.0.0-py2.py3-none-any.whl", hash = "sha256:2f878c3856f17896979b2a4379ba1f09c83e374931ea15cc835c5dd2eee9b368", size = 25468, upload-time = "2024-12-17T17:17:45.359Z" }, +] + +[[package]] +name = "cryptography" +version = "43.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/0d/05/07b55d1fa21ac18c3a8c79f764e2514e6f6a9698f1be44994f5adf0d29db/cryptography-43.0.3.tar.gz", hash = "sha256:315b9001266a492a6ff443b61238f956b214dbec9910a081ba5b6646a055a805", size = 686989, upload-time = "2024-10-18T15:58:32.918Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/f3/01fdf26701a26f4b4dbc337a26883ad5bccaa6f1bbbdd29cd89e22f18a1c/cryptography-43.0.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf7a1932ac4176486eab36a19ed4c0492da5d97123f1406cf15e41b05e787d2e", size = 6225303, upload-time = "2024-10-18T15:57:36.753Z" }, + { url = "https://files.pythonhosted.org/packages/a3/01/4896f3d1b392025d4fcbecf40fdea92d3df8662123f6835d0af828d148fd/cryptography-43.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63efa177ff54aec6e1c0aefaa1a241232dcd37413835a9b674b6e3f0ae2bfd3e", size = 3760905, upload-time = "2024-10-18T15:57:39.166Z" }, + { url = "https://files.pythonhosted.org/packages/0a/be/f9a1f673f0ed4b7f6c643164e513dbad28dd4f2dcdf5715004f172ef24b6/cryptography-43.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e1ce50266f4f70bf41a2c6dc4358afadae90e2a1e5342d3c08883df1675374f", size = 3977271, upload-time = "2024-10-18T15:57:41.227Z" }, + { url = "https://files.pythonhosted.org/packages/4e/49/80c3a7b5514d1b416d7350830e8c422a4d667b6d9b16a9392ebfd4a5388a/cryptography-43.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:443c4a81bb10daed9a8f334365fe52542771f25aedaf889fd323a853ce7377d6", size = 3746606, upload-time = "2024-10-18T15:57:42.903Z" }, + { url = "https://files.pythonhosted.org/packages/0e/16/a28ddf78ac6e7e3f25ebcef69ab15c2c6be5ff9743dd0709a69a4f968472/cryptography-43.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:74f57f24754fe349223792466a709f8e0c093205ff0dca557af51072ff47ab18", size = 3986484, upload-time = "2024-10-18T15:57:45.434Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/f5/69ae8da70c19864a32b0315049866c4d411cce423ec169993d0434218762/cryptography-43.0.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9762ea51a8fc2a88b70cf2995e5675b38d93bf36bd67d91721c309df184f49bd", size = 3852131, upload-time = "2024-10-18T15:57:47.267Z" }, + { url = "https://files.pythonhosted.org/packages/fd/db/e74911d95c040f9afd3612b1f732e52b3e517cb80de8bf183be0b7d413c6/cryptography-43.0.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:81ef806b1fef6b06dcebad789f988d3b37ccaee225695cf3e07648eee0fc6b73", size = 4075647, upload-time = "2024-10-18T15:57:49.684Z" }, + { url = "https://files.pythonhosted.org/packages/56/48/7b6b190f1462818b324e674fa20d1d5ef3e24f2328675b9b16189cbf0b3c/cryptography-43.0.3-cp37-abi3-win32.whl", hash = "sha256:cbeb489927bd7af4aa98d4b261af9a5bc025bd87f0e3547e11584be9e9427be2", size = 2623873, upload-time = "2024-10-18T15:57:51.822Z" }, + { url = "https://files.pythonhosted.org/packages/eb/b1/0ebff61a004f7f89e7b65ca95f2f2375679d43d0290672f7713ee3162aff/cryptography-43.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:f46304d6f0c6ab8e52770addfa2fc41e6629495548862279641972b6215451cd", size = 3068039, upload-time = "2024-10-18T15:57:54.426Z" }, + { url = "https://files.pythonhosted.org/packages/30/d5/c8b32c047e2e81dd172138f772e81d852c51f0f2ad2ae8a24f1122e9e9a7/cryptography-43.0.3-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:8ac43ae87929a5982f5948ceda07001ee5e83227fd69cf55b109144938d96984", size = 6222984, upload-time = "2024-10-18T15:57:56.174Z" }, + { url = "https://files.pythonhosted.org/packages/2f/78/55356eb9075d0be6e81b59f45c7b48df87f76a20e73893872170471f3ee8/cryptography-43.0.3-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:846da004a5804145a5f441b8530b4bf35afbf7da70f82409f151695b127213d5", size = 3762968, upload-time = "2024-10-18T15:57:58.206Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/2c/488776a3dc843f95f86d2f957ca0fc3407d0242b50bede7fad1e339be03f/cryptography-43.0.3-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f996e7268af62598f2fc1204afa98a3b5712313a55c4c9d434aef49cadc91d4", size = 3977754, upload-time = "2024-10-18T15:58:00.683Z" }, + { url = "https://files.pythonhosted.org/packages/7c/04/2345ca92f7a22f601a9c62961741ef7dd0127c39f7310dffa0041c80f16f/cryptography-43.0.3-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f7b178f11ed3664fd0e995a47ed2b5ff0a12d893e41dd0494f406d1cf555cab7", size = 3749458, upload-time = "2024-10-18T15:58:02.225Z" }, + { url = "https://files.pythonhosted.org/packages/ac/25/e715fa0bc24ac2114ed69da33adf451a38abb6f3f24ec207908112e9ba53/cryptography-43.0.3-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c2e6fc39c4ab499049df3bdf567f768a723a5e8464816e8f009f121a5a9f4405", size = 3988220, upload-time = "2024-10-18T15:58:04.331Z" }, + { url = "https://files.pythonhosted.org/packages/21/ce/b9c9ff56c7164d8e2edfb6c9305045fbc0df4508ccfdb13ee66eb8c95b0e/cryptography-43.0.3-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e1be4655c7ef6e1bbe6b5d0403526601323420bcf414598955968c9ef3eb7d16", size = 3853898, upload-time = "2024-10-18T15:58:06.113Z" }, + { url = "https://files.pythonhosted.org/packages/2a/33/b3682992ab2e9476b9c81fff22f02c8b0a1e6e1d49ee1750a67d85fd7ed2/cryptography-43.0.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:df6b6c6d742395dd77a23ea3728ab62f98379eff8fb61be2744d4679ab678f73", size = 4076592, upload-time = "2024-10-18T15:58:08.673Z" }, + { url = "https://files.pythonhosted.org/packages/81/1e/ffcc41b3cebd64ca90b28fd58141c5f68c83d48563c88333ab660e002cd3/cryptography-43.0.3-cp39-abi3-win32.whl", hash = "sha256:d56e96520b1020449bbace2b78b603442e7e378a9b3bd68de65c782db1507995", size = 2623145, upload-time = "2024-10-18T15:58:10.264Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/5c/3dab83cc4aba1f4b0e733e3f0c3e7d4386440d660ba5b1e3ff995feb734d/cryptography-43.0.3-cp39-abi3-win_amd64.whl", hash = "sha256:0c580952eef9bf68c4747774cde7ec1d85a6e61de97281f2dba83c7d2c806362", size = 3068026, upload-time = "2024-10-18T15:58:11.916Z" }, +] + +[[package]] +name = "cssutils" +version = "2.11.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/9f/329d26121fe165be44b1dfff21aa0dc348f04633931f1d20ed6cf448a236/cssutils-2.11.1.tar.gz", hash = "sha256:0563a76513b6af6eebbe788c3bf3d01c920e46b3f90c8416738c5cfc773ff8e2", size = 711657, upload-time = "2024-06-04T15:51:39.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/ec/bb273b7208c606890dc36540fe667d06ce840a6f62f9fae7e658fcdc90fb/cssutils-2.11.1-py3-none-any.whl", hash = "sha256:a67bfdfdff4f3867fab43698ec4897c1a828eca5973f4073321b3bccaf1199b1", size = 385747, upload-time = "2024-06-04T15:51:37.499Z" }, +] + +[[package]] +name = "cycler" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, +] + +[[package]] +name = "dash" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "flask" }, + { name = "importlib-metadata" }, + { name = "nest-asyncio" }, + { name = "plotly" }, + { name = "requests" 
}, + { name = "retrying" }, + { name = "setuptools" }, + { name = "typing-extensions" }, + { name = "werkzeug" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/37/8b5621e0a0b3c6e81a8b6cd3f033aa4b750f53e288dd1a494a887a8a06e9/dash-3.2.0.tar.gz", hash = "sha256:93300b9b99498f8b8ed267e61c455b4ee1282c7e4d4b518600eec87ce6ddea55", size = 7558708, upload-time = "2025-07-31T19:18:59.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/36/e0010483ca49b9bf6f389631ccea07b3ff6b678d14d8c7a0a4357860c36a/dash-3.2.0-py3-none-any.whl", hash = "sha256:4c1819588d83bed2cbcf5807daa5c2380c8c85789a6935a733f018f04ad8a6a2", size = 7900661, upload-time = "2025-07-31T19:18:50.679Z" }, +] + +[[package]] +name = "dash-bootstrap-components" +version = "2.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/d4/5b7da808ff5acb3a6ca702f504d8ef05bc7d4c475b18dadefd783b1120c3/dash_bootstrap_components-2.0.4.tar.gz", hash = "sha256:c3206c0923774bbc6a6ddaa7822b8d9aa5326b0d3c1e7cd795cc975025fe2484", size = 115599, upload-time = "2025-08-20T19:42:09.449Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d6/38/1efeec8b4d741c09ccd169baf8a00c07a0176b58e418d4cd0c30dffedd22/dash_bootstrap_components-2.0.4-py3-none-any.whl", hash = "sha256:767cf0084586c1b2b614ccf50f79fe4525fdbbf8e3a161ed60016e584a14f5d1", size = 204044, upload-time = "2025-08-20T19:42:07.928Z" }, +] + +[[package]] +name = "datadings" +version = "3.4.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gdown" }, + { name = "msgpack" }, + { name = "msgpack-numpy" }, + { name = "natsort" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pyzmq" }, + { name = "requests" }, + { name = "scipy" }, + { name = "simplebloom" }, + { name = "simplejpeg" }, + { name = "tqdm" }, +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/80/18/2e46a7241c4b3ca653c88b607ca129ff9fae23625f09c25ff73d4d90190c/datadings-3.4.7-py3-none-any.whl", hash = "sha256:3602f3581650ac55a2f7cf722a72786284441846907fa244561dc9924edacd19", size = 2907582, upload-time = "2025-05-28T09:44:51.584Z" }, +] + +[[package]] +name = "datasets" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e3/9d/348ed92110ba5f9b70b51ca1078d4809767a835aa2b7ce7e74ad2b98323d/datasets-4.0.0.tar.gz", hash = "sha256:9657e7140a9050db13443ba21cb5de185af8af944479b00e7ff1e00a61c8dbf1", size = 569566, upload-time = "2025-07-09T14:35:52.431Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl", hash = "sha256:7ef95e62025fd122882dbce6cb904c8cd3fbc829de6669a5eb939c77d50e203d", size = 494825, upload-time = "2025-07-09T14:35:50.658Z" }, +] + +[[package]] +name = "debugpy" +version = "1.8.17" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/ad/71e708ff4ca377c4230530d6a7aa7992592648c122a2cd2b321cf8b35a76/debugpy-1.8.17.tar.gz", hash = "sha256:fd723b47a8c08892b1a16b2c6239a8b96637c62a59b94bb5dab4bac592a58a8e", size = 1644129, upload-time = "2025-09-17T16:33:20.633Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/53/3af72b5c159278c4a0cf4cffa518675a0e73bdb7d1cac0239b815502d2ce/debugpy-1.8.17-cp311-cp311-macosx_15_0_universal2.whl", hash = 
"sha256:d3fce3f0e3de262a3b67e69916d001f3e767661c6e1ee42553009d445d1cd840", size = 2207154, upload-time = "2025-09-17T16:33:29.457Z" }, + { url = "https://files.pythonhosted.org/packages/8f/6d/204f407df45600e2245b4a39860ed4ba32552330a0b3f5f160ae4cc30072/debugpy-1.8.17-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:c6bdf134457ae0cac6fb68205776be635d31174eeac9541e1d0c062165c6461f", size = 3170322, upload-time = "2025-09-17T16:33:30.837Z" }, + { url = "https://files.pythonhosted.org/packages/f2/13/1b8f87d39cf83c6b713de2620c31205299e6065622e7dd37aff4808dd410/debugpy-1.8.17-cp311-cp311-win32.whl", hash = "sha256:e79a195f9e059edfe5d8bf6f3749b2599452d3e9380484cd261f6b7cd2c7c4da", size = 5155078, upload-time = "2025-09-17T16:33:33.331Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c5/c012c60a2922cc91caa9675d0ddfbb14ba59e1e36228355f41cab6483469/debugpy-1.8.17-cp311-cp311-win_amd64.whl", hash = "sha256:b532282ad4eca958b1b2d7dbcb2b7218e02cb934165859b918e3b6ba7772d3f4", size = 5179011, upload-time = "2025-09-17T16:33:35.711Z" }, + { url = "https://files.pythonhosted.org/packages/b0/d0/89247ec250369fc76db477720a26b2fce7ba079ff1380e4ab4529d2fe233/debugpy-1.8.17-py2.py3-none-any.whl", hash = "sha256:60c7dca6571efe660ccb7a9508d73ca14b8796c4ed484c2002abba714226cfef", size = 5283210, upload-time = "2025-09-17T16:34:25.835Z" }, +] + +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size 
= 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + +[[package]] +name = "deprecation" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/d3/8ae2869247df154b64c1884d7346d412fed0c49df84db635aab2d1c40e62/deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff", size = 173788, upload-time = "2020-04-20T14:23:38.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/c3/253a89ee03fc9b9682f1541728eb66db7db22148cd94f89ab22528cd1e1b/deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a", size = 11178, upload-time = "2020-04-20T14:23:36.581Z" }, +] + +[[package]] +name = "diffusers" +version = "0.35.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "importlib-metadata" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/68/288ca23c7c05c73e87ffe5efffc282400ac9b017f7a9bb03883f4310ea15/diffusers-0.35.2.tar.gz", hash = "sha256:30ecd552303edfcfe1724573c3918a8462ee3ab4d529bdbd4c0045f763affded", size = 3366711, upload-time = "2025-10-15T04:05:17.213Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/2e/38d9824f8c6bb048c5ba21c6d4da54c29c162a46b58b3ef907a360a76d3e/diffusers-0.35.2-py3-none-any.whl", hash = "sha256:d50d5e74fdd6dcf55e5c1d304bc52cc7c2659abd1752740d736d7b54078b4db5", size = 4121649, upload-time = "2025-10-15T04:05:14.391Z" }, +] + +[[package]] +name = "dill" +version = "0.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/17/4d/ac7ffa80c69ea1df30a8aa11b3578692a5118e7cd1aa157e3ef73b092d15/dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", size = 184847, upload-time = "2024-01-27T23:42:16.145Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252, upload-time = "2024-01-27T23:42:14.239Z" }, +] + +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + +[[package]] +name = "docgenie" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "accelerate" }, + { name = "aiohappyeyeballs" }, + { name = "aiohttp" }, + { name = "aiosignal" }, + { name = "annotated-types" }, + { name = "anthropic" }, + { name = "anyio" }, + { name = "atria-core" }, + { name = "attrs" }, + { name = "beautifulsoup4" }, + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "click" }, + { name = "colorlog" }, + { name = "cssutils" }, + { name = "dash" }, + { name = "dash-bootstrap-components" }, + { name = "datadings" }, + { name = "datasets" }, + { name = "diffusers" }, + { name = "dill" }, + { name = "distro" }, + { name = "due-evaluator" }, + { name = "editdistance" }, + { name = "einops" }, + { name = "fastapi" }, + { name = "filelock" }, + { name = "fire" }, + { name = "flask" }, + { name = "frozenlist" }, + { name = "fsspec" }, + { name = "gitdb" }, + { name = "gitpython" }, + { name = "google-api-python-client" }, + { name = "google-auth-httplib2" }, + { name = "google-auth-oauthlib" }, + { name = "h11" }, + { name = "h5py" }, + { name = "hdbscan" }, + { name = "hf-xet" }, + { name = "httpcore" }, + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "hydra-core" }, + { name = "icecream" }, + { name = "idna" }, + { name = "jinja2" }, + { name = "jiter" }, + { name = "jsonlines" }, + { name = "levenshtein" }, + { name = "lxml" }, + { name = "markdown-it-py" }, + { name = "markupsafe" }, + { name = "matplotlib" }, + { name = "mdurl" }, + { name = "mmcv" }, + { name = "mmdet" }, + { name = "more-itertools" }, + { name = "mpmath" }, + { name = "multidict" }, + { name = "multiprocess" }, + { name = "networkx" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pdf2image" }, + { name = "pdfplumber" }, + { name = "pillow" }, + { name = "platformdirs" }, + { name = "playwright" }, + { name = "plotly" }, + { name = "propcache" }, + { name = "protobuf" }, + { name = "pyarrow" }, + { name = "pydantic" }, + { name = "pydantic-argparse" }, + { 
name = "pydantic-core" }, + { name = "pydantic-settings" }, + { name = "pygments" }, + { name = "pymupdf" }, + { name = "pypdf2" }, + { name = "pytesseract" }, + { name = "python-barcode" }, + { name = "python-dateutil" }, + { name = "python-dotenv" }, + { name = "python-multipart" }, + { name = "pytorch-ignite" }, + { name = "pytz" }, + { name = "pyyaml" }, + { name = "rapidfuzz" }, + { name = "redis" }, + { name = "regex" }, + { name = "requests" }, + { name = "rich" }, + { name = "rq" }, + { name = "safetensors" }, + { name = "scikit-learn" }, + { name = "seaborn" }, + { name = "selenium" }, + { name = "sentence-transformers" }, + { name = "sentry-sdk" }, + { name = "seqeval" }, + { name = "setuptools" }, + { name = "six" }, + { name = "smmap" }, + { name = "sniffio" }, + { name = "soupsieve" }, + { name = "supabase" }, + { name = "sympy" }, + { name = "tenacity" }, + { name = "tensorboard" }, + { name = "tensorboardx" }, + { name = "textdistance" }, + { name = "timm" }, + { name = "tokenizers" }, + { name = "torch" }, + { name = "torchinfo" }, + { name = "torchvision", version = "0.16.0", source = { registry = "https://download.pytorch.org/whl/cu121" }, marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.16.0+cu121", source = { registry = "https://download.pytorch.org/whl/cu121" }, marker = "platform_machine != 'aarch64' or platform_python_implementation != 'CPython' or sys_platform != 'linux'" }, + { name = "tqdm" }, + { name = "transformers" }, + { name = "triton" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, + { name = "tzdata" }, + { name = "umap-learn" }, + { name = "urllib3" }, + { name = "uvicorn", extra = ["standard"] }, + { name = "wandb" }, + { name = "wheel" }, + { name = "xxhash" }, + { name = "yarl" }, +] + +[package.dev-dependencies] +dev = [ + { name = "ipykernel" }, + { name = "pydrive2" }, + { name = "pytest" }, +] 
+ +[package.metadata] +requires-dist = [ + { name = "accelerate", specifier = ">=1.11.0" }, + { name = "aiohappyeyeballs", specifier = "==2.6.1" }, + { name = "aiohttp", specifier = "==3.12.15" }, + { name = "aiosignal", specifier = "==1.4.0" }, + { name = "annotated-types", specifier = "==0.7.0" }, + { name = "anthropic", specifier = "==0.64.0" }, + { name = "anyio", specifier = "==4.10.0" }, + { name = "atria-core", git = "https://github.com/saifullah3396/atria_core.git?branch=devel-estella" }, + { name = "attrs", specifier = "==25.3.0" }, + { name = "beautifulsoup4", specifier = "==4.13.4" }, + { name = "certifi", specifier = "==2025.8.3" }, + { name = "charset-normalizer", specifier = "==3.4.3" }, + { name = "click", specifier = "==8.2.1" }, + { name = "colorlog", specifier = ">=6.9.0" }, + { name = "cssutils", specifier = "==2.11.1" }, + { name = "dash", specifier = ">=3.2.0" }, + { name = "dash-bootstrap-components", specifier = ">=2.0.4" }, + { name = "datadings", specifier = ">=3.4.7" }, + { name = "datasets", specifier = "==4.0.0" }, + { name = "diffusers", specifier = ">=0.35.2" }, + { name = "dill", specifier = "==0.3.8" }, + { name = "distro", specifier = "==1.9.0" }, + { name = "due-evaluator", git = "https://github.com/due-benchmark/evaluator.git" }, + { name = "editdistance", specifier = ">=0.8.1" }, + { name = "einops", specifier = "==0.8.2" }, + { name = "fastapi", specifier = ">=0.109.0" }, + { name = "filelock", specifier = "==3.19.1" }, + { name = "fire", specifier = ">=0.7.1" }, + { name = "flask", specifier = ">=3.1.2" }, + { name = "frozenlist", specifier = "==1.7.0" }, + { name = "fsspec", specifier = "==2025.3.0" }, + { name = "gitdb", specifier = "==4.0.12" }, + { name = "gitpython", specifier = "==3.1.45" }, + { name = "google-api-python-client", specifier = ">=2.100.0" }, + { name = "google-auth-httplib2", specifier = ">=0.2.0" }, + { name = "google-auth-oauthlib", specifier = ">=1.2.0" }, + { name = "h11", specifier = "==0.16.0" }, + { 
name = "h5py", specifier = ">=3.14.0" }, + { name = "hdbscan", specifier = ">=0.8.40" }, + { name = "hf-xet", specifier = "==1.1.8" }, + { name = "httpcore", specifier = "==1.0.9" }, + { name = "httpx", specifier = "==0.28.1" }, + { name = "huggingface-hub", specifier = "==0.34.4" }, + { name = "hydra-core", specifier = ">=1.3.2" }, + { name = "icecream", specifier = ">=2.1.8" }, + { name = "idna", specifier = "==3.10" }, + { name = "jinja2", specifier = "==3.1.6" }, + { name = "jiter", specifier = "==0.10.0" }, + { name = "jsonlines", specifier = "==4.0.0" }, + { name = "levenshtein", specifier = "==0.27.1" }, + { name = "lxml", specifier = ">=5.1.0" }, + { name = "markdown-it-py", specifier = "==4.0.0" }, + { name = "markupsafe", specifier = "==3.0.2" }, + { name = "matplotlib", specifier = ">=3.10.6" }, + { name = "mdurl", specifier = "==0.1.2" }, + { name = "mmcv", url = "https://download.openmmlab.com/mmcv/dist/cu121/torch2.1.0/mmcv-2.1.0-cp311-cp311-manylinux1_x86_64.whl" }, + { name = "mmdet", specifier = "==3.3.0" }, + { name = "more-itertools", specifier = "==10.7.0" }, + { name = "mpmath", specifier = "==1.3.0" }, + { name = "multidict", specifier = "==6.6.4" }, + { name = "multiprocess", specifier = "==0.70.16" }, + { name = "networkx", specifier = "==3.5" }, + { name = "numpy", specifier = "==1.26.4" }, + { name = "packaging", specifier = "==25.0" }, + { name = "pandas", specifier = "==2.3.1" }, + { name = "pdf2image", specifier = "==1.17.0" }, + { name = "pdfplumber", specifier = ">=0.10.4" }, + { name = "pillow", specifier = "==11.3.0" }, + { name = "platformdirs", specifier = "==4.3.8" }, + { name = "playwright", specifier = ">=1.55.0" }, + { name = "plotly", specifier = ">=6.3.1" }, + { name = "propcache", specifier = "==0.3.2" }, + { name = "protobuf", specifier = "==6.32.0" }, + { name = "pyarrow", specifier = "==21.0.0" }, + { name = "pydantic", specifier = "==2.11.7" }, + { name = "pydantic-argparse", specifier = ">=0.10.0" }, + { name = 
"pydantic-core", specifier = "==2.33.2" }, + { name = "pydantic-settings", specifier = ">=2.11.0" }, + { name = "pygments", specifier = "==2.19.2" }, + { name = "pymupdf", specifier = "==1.26.3" }, + { name = "pypdf2", specifier = "==3.0.1" }, + { name = "pytesseract", specifier = ">=0.3.10" }, + { name = "python-barcode", specifier = ">=0.16.1" }, + { name = "python-dateutil", specifier = "==2.9.0.post0" }, + { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "python-multipart", specifier = ">=0.0.6" }, + { name = "pytorch-ignite", specifier = ">=0.5.2" }, + { name = "pytz", specifier = "==2025.2" }, + { name = "pyyaml", specifier = "==6.0.2" }, + { name = "rapidfuzz", specifier = "==3.13.0" }, + { name = "redis", specifier = ">=5.0.0" }, + { name = "regex", specifier = "==2025.7.34" }, + { name = "requests", specifier = "==2.32.5" }, + { name = "rich", specifier = "==14.1.0" }, + { name = "rq", specifier = ">=1.15.0" }, + { name = "safetensors", specifier = "==0.6.2" }, + { name = "scikit-learn", specifier = ">=1.7.2" }, + { name = "seaborn", specifier = ">=0.13.2" }, + { name = "selenium", specifier = ">=4.36.0" }, + { name = "sentence-transformers", specifier = ">=5.1.1" }, + { name = "sentry-sdk", specifier = "==2.35.0" }, + { name = "seqeval", specifier = ">=1.2.2" }, + { name = "setuptools", specifier = "==78.1.1" }, + { name = "six", specifier = "==1.17.0" }, + { name = "smmap", specifier = "==5.0.2" }, + { name = "sniffio", specifier = "==1.3.1" }, + { name = "soupsieve", specifier = "==2.7" }, + { name = "supabase", specifier = ">=2.0.0" }, + { name = "sympy", specifier = "==1.13.1" }, + { name = "tenacity", specifier = ">=8.2.3" }, + { name = "tensorboard", specifier = ">=2.20.0" }, + { name = "tensorboardx", specifier = ">=2.6.4" }, + { name = "textdistance", specifier = ">=4.0.0" }, + { name = "timm", specifier = ">=1.0.20" }, + { name = "tokenizers", specifier = "==0.21.4" }, + { name = "torch", specifier = "==2.1.0", index = 
"https://download.pytorch.org/whl/cu121" }, + { name = "torchinfo", specifier = ">=1.8.0" }, + { name = "torchvision", index = "https://download.pytorch.org/whl/cu121" }, + { name = "tqdm", specifier = "==4.67.1" }, + { name = "transformers", specifier = "==4.49" }, + { name = "triton", specifier = "==2.1.0" }, + { name = "typing-extensions", specifier = "==4.14.1" }, + { name = "typing-inspection", specifier = "==0.4.1" }, + { name = "tzdata", specifier = "==2025.2" }, + { name = "umap-learn", specifier = "==0.5.9.post2" }, + { name = "urllib3", specifier = "==2.5.0" }, + { name = "uvicorn", extras = ["standard"], specifier = ">=0.27.0" }, + { name = "wandb", specifier = "==0.21.1" }, + { name = "wheel", specifier = "==0.45.1" }, + { name = "xxhash", specifier = "==3.5.0" }, + { name = "yarl", specifier = "==1.20.1" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "ipykernel", specifier = ">=6.30.1" }, + { name = "pydrive2", specifier = ">=1.21.3" }, + { name = "pytest", specifier = ">=8.4.2" }, +] + +[[package]] +name = "due-evaluator" +version = "0.0.8" +source = { git = "https://github.com/due-benchmark/evaluator.git#75c0f550f5b7c0104a86993d5f307794687493ac" } +dependencies = [ + { name = "numpy" }, + { name = "pandas" }, + { name = "scipy" }, + { name = "textdistance", extra = ["levenshtein"] }, +] + +[[package]] +name = "editdistance" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d5/18/9f4f975ca87a390832b1c22478f3702fcdf739f83211e24d054b7551270d/editdistance-0.8.1.tar.gz", hash = "sha256:d1cdf80a5d5014b0c9126a69a42ce55a457b457f6986ff69ca98e4fe4d2d8fed", size = 50006, upload-time = "2024-02-10T07:44:53.914Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/dc/d0c29fd52d8f9e795653ed2b838a2a48c739cdfff04ac5b79c6c0ecbdf79/editdistance-0.8.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:486105603a273d73d12a54f347dffa70ab281749d7c3879658b377bc49e4b98c", size = 106079, upload-time = "2024-02-10T07:43:34.34Z" }, + { url = "https://files.pythonhosted.org/packages/b4/c6/75fa45d7b78fbea6fd894f4e48895a75bd3c83d4a9a6b57673881d74d3e0/editdistance-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fad081f5f86a175c1a09a4e9e45b95c9349e454c21e181e842e01c85f1f536fc", size = 80580, upload-time = "2024-02-10T07:43:35.947Z" }, + { url = "https://files.pythonhosted.org/packages/b7/a3/058d823b6285c3511dc94ed80620c3fb0c18b4aaa708f70ba71f3af28436/editdistance-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8cb78e125f6759398885a775f5eed07c2bb72b2f86da43e674c6b6a3335b273b", size = 79087, upload-time = "2024-02-10T07:43:36.923Z" }, + { url = "https://files.pythonhosted.org/packages/a0/3a/0b13c7864c93b1e9b9952bd2a33c5ef3c4fd1bf70a5fad6924789e70e5eb/editdistance-0.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3778ca60aa89def9144b70e330bcec5330c7da1d69cb28c612e90b84510a1d3d", size = 409296, upload-time = "2024-02-10T07:43:38.52Z" }, + { url = "https://files.pythonhosted.org/packages/96/8a/db0fd79e8ddb9b5f86f274107c5d0a27ec4f2af88877df1f26c2c6d150cc/editdistance-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fba945eaa0436cf40bc53d7e299dc537c7c71353379a095b7459ff4af910da33", size = 412913, upload-time = "2024-02-10T07:43:39.852Z" }, + { url = "https://files.pythonhosted.org/packages/0d/d2/98be7112750ff17b436dd76f988f1e38570dcec0df8578ee19ef046f22fe/editdistance-0.8.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:877f2a0d801f32bc1a1878901ffb947b974361e849c66e314a7f1d786a446b58", size = 407430, upload-time = "2024-02-10T07:43:41.048Z" }, + { url = "https://files.pythonhosted.org/packages/03/62/1815e3bf164910c47ba1948c8b5e937a40c7f9763b64e98fb6666b01dd06/editdistance-0.8.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:e79d351ca40a6ead5f3763253fd7521572ee0d3e5d42538630e56d10f48db481", size = 909217, upload-time = "2024-02-10T07:43:42.916Z" }, + { url = "https://files.pythonhosted.org/packages/0c/d3/a832cea7b507a9be54e4ac3d1340fb66dca5f9c16c70bf38d5039e8fdede/editdistance-0.8.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:70ed382b3052a51161bad0149d4665003bf3b949fce0b01bf1253a4cc1a88239", size = 969407, upload-time = "2024-02-10T07:43:44.912Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b4/db291d2a3845cbf8047b4b5aad3b3e038a8a2994d87027b40e1a1b0f4b74/editdistance-0.8.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a529bfb384c4000775d76739c4e64f73337f0f5a3784933b1321b577a62bed4e", size = 922112, upload-time = "2024-02-10T07:43:47.047Z" }, + { url = "https://files.pythonhosted.org/packages/c4/26/7ddeacada4982d0b892a28897e21871d0f25bca165e3663e37c3a272808a/editdistance-0.8.1-cp311-cp311-win32.whl", hash = "sha256:b082232429e731f181af7f7d2bcf79da6ca8fadd04e9086c11e2973f7d330c81", size = 80799, upload-time = "2024-02-10T07:43:48.231Z" }, + { url = "https://files.pythonhosted.org/packages/52/a1/778af8590b8b12f03f62eacc3c8744407ade9e3d69be6dabe38d0afbf2dd/editdistance-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:cef1a4359252a49f2c4718e64e9d40027d9d951b289d045bdb278656e59f6af8", size = 79698, upload-time = "2024-02-10T07:43:49.234Z" }, +] + +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = 
"sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + +[[package]] +name = "executing" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, +] + +[[package]] +name = "fastapi" +version = "0.128.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-doc" }, + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/08/8c8508db6c7b9aae8f7175046af41baad690771c9bcde676419965e338c7/fastapi-0.128.0.tar.gz", hash = "sha256:1cc179e1cef10a6be60ffe429f79b829dce99d8de32d7acb7e6c8dfdf7f2645a", size = 365682, upload-time = "2025-12-27T15:21:13.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/05/5cbb59154b093548acd0f4c7c474a118eda06da25aa75c616b72d8fcd92a/fastapi-0.128.0-py3-none-any.whl", hash = "sha256:aebd93f9716ee3b4f4fcfe13ffb7cf308d99c9f3ab5622d8877441072561582d", size = 103094, upload-time = "2025-12-27T15:21:12.154Z" }, +] + +[[package]] +name = "filelock" +version = "3.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = 
"sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, +] + +[[package]] +name = "fire" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "termcolor" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/00/f8d10588d2019d6d6452653def1ee807353b21983db48550318424b5ff18/fire-0.7.1.tar.gz", hash = "sha256:3b208f05c736de98fb343310d090dcc4d8c78b2a89ea4f32b837c586270a9cbf", size = 88720, upload-time = "2025-08-16T20:20:24.175Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" }, +] + +[[package]] +name = "flask" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blinker" }, + { name = "click" }, + { name = "itsdangerous" }, + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "werkzeug" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/6d/cfe3c0fcc5e477df242b98bfe186a4c34357b4847e87ecaef04507332dab/flask-3.1.2.tar.gz", hash = "sha256:bf656c15c80190ed628ad08cdfd3aaa35beb087855e2f494910aa3774cc4fd87", size = 720160, upload-time = "2025-08-19T21:03:21.205Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/f9/7f9263c5695f4bd0023734af91bedb2ff8209e8de6ead162f35d8dc762fd/flask-3.1.2-py3-none-any.whl", hash = "sha256:ca1d8112ec8a6158cc29ea4858963350011b5c846a414cdb7a954aa9e967d03c", size = 103308, 
upload-time = "2025-08-19T21:03:19.499Z" }, +] + +[[package]] +name = "fonttools" +version = "4.60.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4b/42/97a13e47a1e51a5a7142475bbcf5107fe3a68fc34aef331c897d5fb98ad0/fonttools-4.60.1.tar.gz", hash = "sha256:ef00af0439ebfee806b25f24c8f92109157ff3fac5731dc7867957812e87b8d9", size = 3559823, upload-time = "2025-09-29T21:13:27.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/85/639aa9bface1537e0fb0f643690672dde0695a5bbbc90736bc571b0b1941/fonttools-4.60.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7b4c32e232a71f63a5d00259ca3d88345ce2a43295bb049d21061f338124246f", size = 2831872, upload-time = "2025-09-29T21:11:20.329Z" }, + { url = "https://files.pythonhosted.org/packages/6b/47/3c63158459c95093be9618794acb1067b3f4d30dcc5c3e8114b70e67a092/fonttools-4.60.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3630e86c484263eaac71d117085d509cbcf7b18f677906824e4bace598fb70d2", size = 2356990, upload-time = "2025-09-29T21:11:22.754Z" }, + { url = "https://files.pythonhosted.org/packages/94/dd/1934b537c86fcf99f9761823f1fc37a98fbd54568e8e613f29a90fed95a9/fonttools-4.60.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5c1015318e4fec75dd4943ad5f6a206d9727adf97410d58b7e32ab644a807914", size = 5042189, upload-time = "2025-09-29T21:11:25.061Z" }, + { url = "https://files.pythonhosted.org/packages/d2/d2/9f4e4c4374dd1daa8367784e1bd910f18ba886db1d6b825b12edf6db3edc/fonttools-4.60.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e6c58beb17380f7c2ea181ea11e7db8c0ceb474c9dd45f48e71e2cb577d146a1", size = 4978683, upload-time = "2025-09-29T21:11:27.693Z" }, + { url = "https://files.pythonhosted.org/packages/cc/c4/0fb2dfd1ecbe9a07954cc13414713ed1eab17b1c0214ef07fc93df234a47/fonttools-4.60.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:ec3681a0cb34c255d76dd9d865a55f260164adb9fa02628415cdc2d43ee2c05d", size = 5021372, upload-time = "2025-09-29T21:11:30.257Z" }, + { url = "https://files.pythonhosted.org/packages/0c/d5/495fc7ae2fab20223cc87179a8f50f40f9a6f821f271ba8301ae12bb580f/fonttools-4.60.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f4b5c37a5f40e4d733d3bbaaef082149bee5a5ea3156a785ff64d949bd1353fa", size = 5132562, upload-time = "2025-09-29T21:11:32.737Z" }, + { url = "https://files.pythonhosted.org/packages/bc/fa/021dab618526323c744e0206b3f5c8596a2e7ae9aa38db5948a131123e83/fonttools-4.60.1-cp311-cp311-win32.whl", hash = "sha256:398447f3d8c0c786cbf1209711e79080a40761eb44b27cdafffb48f52bcec258", size = 2230288, upload-time = "2025-09-29T21:11:35.015Z" }, + { url = "https://files.pythonhosted.org/packages/bb/78/0e1a6d22b427579ea5c8273e1c07def2f325b977faaf60bb7ddc01456cb1/fonttools-4.60.1-cp311-cp311-win_amd64.whl", hash = "sha256:d066ea419f719ed87bc2c99a4a4bfd77c2e5949cb724588b9dd58f3fd90b92bf", size = 2278184, upload-time = "2025-09-29T21:11:37.434Z" }, + { url = "https://files.pythonhosted.org/packages/c7/93/0dd45cd283c32dea1545151d8c3637b4b8c53cdb3a625aeb2885b184d74d/fonttools-4.60.1-py3-none-any.whl", hash = "sha256:906306ac7afe2156fcf0042173d6ebbb05416af70f6b370967b47f8f00103bbb", size = 1143175, upload-time = "2025-09-29T21:13:24.134Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/b1/b64018016eeb087db503b038296fd782586432b9c077fc5c7839e9cb6ef6/frozenlist-1.7.0.tar.gz", hash = "sha256:2e310d81923c2437ea8670467121cc3e9b0f76d3043cc1d2331d56c7fb7a3a8f", size = 45078, upload-time = "2025-06-09T23:02:35.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/7e/803dde33760128acd393a27eb002f2020ddb8d99d30a44bfbaab31c5f08a/frozenlist-1.7.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:aa51e147a66b2d74de1e6e2cf5921890de6b0f4820b257465101d7f37b49fb5a", size = 82251, upload-time = "2025-06-09T23:00:16.279Z" }, + { url = "https://files.pythonhosted.org/packages/75/a9/9c2c5760b6ba45eae11334db454c189d43d34a4c0b489feb2175e5e64277/frozenlist-1.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9b35db7ce1cd71d36ba24f80f0c9e7cff73a28d7a74e91fe83e23d27c7828750", size = 48183, upload-time = "2025-06-09T23:00:17.698Z" }, + { url = "https://files.pythonhosted.org/packages/47/be/4038e2d869f8a2da165f35a6befb9158c259819be22eeaf9c9a8f6a87771/frozenlist-1.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:34a69a85e34ff37791e94542065c8416c1afbf820b68f720452f636d5fb990cd", size = 47107, upload-time = "2025-06-09T23:00:18.952Z" }, + { url = "https://files.pythonhosted.org/packages/79/26/85314b8a83187c76a37183ceed886381a5f992975786f883472fcb6dc5f2/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a646531fa8d82c87fe4bb2e596f23173caec9185bfbca5d583b4ccfb95183e2", size = 237333, upload-time = "2025-06-09T23:00:20.275Z" }, + { url = "https://files.pythonhosted.org/packages/1f/fd/e5b64f7d2c92a41639ffb2ad44a6a82f347787abc0c7df5f49057cf11770/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:79b2ffbba483f4ed36a0f236ccb85fbb16e670c9238313709638167670ba235f", size = 231724, upload-time = "2025-06-09T23:00:21.705Z" }, + { url = "https://files.pythonhosted.org/packages/20/fb/03395c0a43a5976af4bf7534759d214405fbbb4c114683f434dfdd3128ef/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a26f205c9ca5829cbf82bb2a84b5c36f7184c4316617d7ef1b271a56720d6b30", size = 245842, upload-time = "2025-06-09T23:00:23.148Z" }, + { url = "https://files.pythonhosted.org/packages/d0/15/c01c8e1dffdac5d9803507d824f27aed2ba76b6ed0026fab4d9866e82f1f/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:bcacfad3185a623fa11ea0e0634aac7b691aa925d50a440f39b458e41c561d98", size = 239767, upload-time = "2025-06-09T23:00:25.103Z" }, + { url = "https://files.pythonhosted.org/packages/14/99/3f4c6fe882c1f5514b6848aa0a69b20cb5e5d8e8f51a339d48c0e9305ed0/frozenlist-1.7.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72c1b0fe8fe451b34f12dce46445ddf14bd2a5bcad7e324987194dc8e3a74c86", size = 224130, upload-time = "2025-06-09T23:00:27.061Z" }, + { url = "https://files.pythonhosted.org/packages/4d/83/220a374bd7b2aeba9d0725130665afe11de347d95c3620b9b82cc2fcab97/frozenlist-1.7.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61d1a5baeaac6c0798ff6edfaeaa00e0e412d49946c53fae8d4b8e8b3566c4ae", size = 235301, upload-time = "2025-06-09T23:00:29.02Z" }, + { url = "https://files.pythonhosted.org/packages/03/3c/3e3390d75334a063181625343e8daab61b77e1b8214802cc4e8a1bb678fc/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7edf5c043c062462f09b6820de9854bf28cc6cc5b6714b383149745e287181a8", size = 234606, upload-time = "2025-06-09T23:00:30.514Z" }, + { url = "https://files.pythonhosted.org/packages/23/1e/58232c19608b7a549d72d9903005e2d82488f12554a32de2d5fb59b9b1ba/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:d50ac7627b3a1bd2dcef6f9da89a772694ec04d9a61b66cf87f7d9446b4a0c31", size = 248372, upload-time = "2025-06-09T23:00:31.966Z" }, + { url = "https://files.pythonhosted.org/packages/c0/a4/e4a567e01702a88a74ce8a324691e62a629bf47d4f8607f24bf1c7216e7f/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ce48b2fece5aeb45265bb7a58259f45027db0abff478e3077e12b05b17fb9da7", size = 229860, upload-time = "2025-06-09T23:00:33.375Z" }, + { url = "https://files.pythonhosted.org/packages/73/a6/63b3374f7d22268b41a9db73d68a8233afa30ed164c46107b33c4d18ecdd/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_ppc64le.whl", 
hash = "sha256:fe2365ae915a1fafd982c146754e1de6ab3478def8a59c86e1f7242d794f97d5", size = 245893, upload-time = "2025-06-09T23:00:35.002Z" }, + { url = "https://files.pythonhosted.org/packages/6d/eb/d18b3f6e64799a79673c4ba0b45e4cfbe49c240edfd03a68be20002eaeaa/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:45a6f2fdbd10e074e8814eb98b05292f27bad7d1883afbe009d96abdcf3bc898", size = 246323, upload-time = "2025-06-09T23:00:36.468Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f5/720f3812e3d06cd89a1d5db9ff6450088b8f5c449dae8ffb2971a44da506/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:21884e23cffabb157a9dd7e353779077bf5b8f9a58e9b262c6caad2ef5f80a56", size = 233149, upload-time = "2025-06-09T23:00:37.963Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/03efbf545e217d5db8446acfd4c447c15b7c8cf4dbd4a58403111df9322d/frozenlist-1.7.0-cp311-cp311-win32.whl", hash = "sha256:284d233a8953d7b24f9159b8a3496fc1ddc00f4db99c324bd5fb5f22d8698ea7", size = 39565, upload-time = "2025-06-09T23:00:39.753Z" }, + { url = "https://files.pythonhosted.org/packages/58/17/fe61124c5c333ae87f09bb67186d65038834a47d974fc10a5fadb4cc5ae1/frozenlist-1.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:387cbfdcde2f2353f19c2f66bbb52406d06ed77519ac7ee21be0232147c2592d", size = 44019, upload-time = "2025-06-09T23:00:40.988Z" }, + { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, +] + +[[package]] +name = "fsspec" +version = "2025.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/34/f4/5721faf47b8c499e776bc34c6a8fc17efdf7fdef0b00f398128bc5dcb4ac/fsspec-2025.3.0.tar.gz", hash = 
"sha256:a935fd1ea872591f2b5148907d103488fc523295e6c64b835cfad8c3eca44972", size = 298491, upload-time = "2025-03-07T21:47:56.461Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/53/eb690efa8513166adef3e0669afd31e95ffde69fb3c52ec2ac7223ed6018/fsspec-2025.3.0-py3-none-any.whl", hash = "sha256:efb87af3efa9103f94ca91a7f8cb7a4df91af9f74fc106c9c7ea0efd7277c1b3", size = 193615, upload-time = "2025-03-07T21:47:54.809Z" }, +] + +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + +[[package]] +name = "gdown" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "filelock" }, + { name = "requests", extra = ["socks"] }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/6a/37e6b70c5bda3161e40265861e63b64a86bfc6ca6a8f1c35328a675c84fd/gdown-5.2.0.tar.gz", hash = "sha256:2145165062d85520a3cd98b356c9ed522c5e7984d408535409fd46f94defc787", size = 284647, upload-time = "2024-05-12T06:45:12.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/70/e07c381e6488a77094f04c85c9caf1c8008cdc30778f7019bc52e5285ef0/gdown-5.2.0-py3-none-any.whl", hash = "sha256:33083832d82b1101bdd0e9df3edd0fbc0e1c5f14c9d8c38d2a35bf1683b526d6", size = 18235, upload-time = "2024-05-12T06:45:10.017Z" }, +] + +[[package]] +name = "gitdb" +version = "4.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = 
"sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, upload-time = "2025-01-02T07:20:43.624Z" }, +] + +[[package]] +name = "gitpython" +version = "3.1.45" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, +] + +[[package]] +name = "google-api-core" +version = "2.28.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a4/27/77ec922bf9b10ff605192cc6f7164f1448e60a9404290ed9b9c33589b1df/google_api_core-2.28.0.tar.gz", hash = "sha256:4743b7d45fe8c0930e59928b1bade287242910f30b06ff9b22f139a3e33271b8", size = 176510, upload-time = "2025-10-27T22:50:27.778Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/8a/c75ed5fd7819742201ffffbd61bb081af4819ea882a6b84930fa93f8e96f/google_api_core-2.28.0-py3-none-any.whl", hash = "sha256:b4362b0e2e6bc06037cfb0e2b28e2fe0c3f9d760dc311f314d5fb373768c7387", size = 173371, upload-time = "2025-10-27T22:50:25.853Z" }, +] + +[[package]] +name = "google-api-python-client" +version = "2.185.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "google-auth" }, + { name = 
"google-auth-httplib2" }, + { name = "httplib2" }, + { name = "uritemplate" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/5a/6f9b49d67ea91376305fdb8bbf2877c746d756e45fd8fb7d2e32d6dad19b/google_api_python_client-2.185.0.tar.gz", hash = "sha256:aa1b338e4bb0f141c2df26743f6b46b11f38705aacd775b61971cbc51da089c3", size = 13885609, upload-time = "2025-10-17T15:00:35.623Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/28/be3b17bd6a190c8c2ec9e4fb65d43e6ecd7b7a1bb19ccc1d9ab4f687a58c/google_api_python_client-2.185.0-py3-none-any.whl", hash = "sha256:00fe173a4b346d2397fbe0d37ac15368170dfbed91a0395a66ef2558e22b93fc", size = 14453595, upload-time = "2025-10-17T15:00:33.176Z" }, +] + +[[package]] +name = "google-auth" +version = "2.41.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/af/5129ce5b2f9688d2fa49b463e544972a7c82b0fdb50980dafee92e121d9f/google_auth-2.41.1.tar.gz", hash = "sha256:b76b7b1f9e61f0cb7e88870d14f6a94aeef248959ef6992670efee37709cbfd2", size = 292284, upload-time = "2025-09-30T22:51:26.363Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/a4/7319a2a8add4cc352be9e3efeff5e2aacee917c85ca2fa1647e29089983c/google_auth-2.41.1-py2.py3-none-any.whl", hash = "sha256:754843be95575b9a19c604a848a41be03f7f2afd8c019f716dc1f51ee41c639d", size = 221302, upload-time = "2025-09-30T22:51:24.212Z" }, +] + +[[package]] +name = "google-auth-httplib2" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "httplib2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/56/be/217a598a818567b28e859ff087f347475c807a5649296fb5a817c58dacef/google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05", size = 10842, upload-time = 
"2023-12-12T17:40:30.722Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/8a/fe34d2f3f9470a27b01c9e76226965863f153d5fbe276f83608562e49c04/google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d", size = 9253, upload-time = "2023-12-12T17:40:13.055Z" }, +] + +[[package]] +name = "google-auth-oauthlib" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "requests-oauthlib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/b4/1b19567e4c567b796f5c593d89895f3cfae5a38e04f27c6af87618fd0942/google_auth_oauthlib-1.3.0.tar.gz", hash = "sha256:cd39e807ac7229d6b8b9c1e297321d36fcc8a9e4857dff4301870985df51a528", size = 21777, upload-time = "2026-02-27T14:13:01.489Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/56/909fd5632226d3fba31d7aeffd4754410735d49362f5809956fe3e9af344/google_auth_oauthlib-1.3.0-py3-none-any.whl", hash = "sha256:386b3fb85cf4a5b819c6ad23e3128d975216b4cac76324de1d90b128aaf38f29", size = 19308, upload-time = "2026-02-27T14:12:47.865Z" }, +] + +[[package]] +name = "googleapis-common-protos" +version = "1.71.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/43/b25abe02db2911397819003029bef768f68a974f2ece483e6084d1a5f754/googleapis_common_protos-1.71.0.tar.gz", hash = "sha256:1aec01e574e29da63c80ba9f7bbf1ccfaacf1da877f23609fe236ca7c72a2e2e", size = 146454, upload-time = "2025-10-20T14:58:08.732Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/e8/eba9fece11d57a71e3e22ea672742c8f3cf23b35730c9e96db768b295216/googleapis_common_protos-1.71.0-py3-none-any.whl", hash = "sha256:59034a1d849dc4d18971997a72ac56246570afdd17f9369a0ff68218d50ab78c", size = 294576, upload-time = "2025-10-20T14:56:21.295Z" }, +] + +[[package]] +name = 
"greenlet" +version = "3.2.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/b8/704d753a5a45507a7aab61f18db9509302ed3d0a27ac7e0359ec2905b1a6/greenlet-3.2.4.tar.gz", hash = "sha256:0dca0d95ff849f9a364385f36ab49f50065d76964944638be9691e1832e9f86d", size = 188260, upload-time = "2025-08-07T13:24:33.51Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, + { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, + { url = "https://files.pythonhosted.org/packages/ae/8f/95d48d7e3d433e6dae5b1682e4292242a53f22df82e6d3dda81b1701a960/greenlet-3.2.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:94abf90142c2a18151632371140b3dba4dee031633fe614cb592dbb6c9e17bc3", size = 644646, upload-time = "2025-08-07T13:45:26.523Z" }, + { url = "https://files.pythonhosted.org/packages/d5/5e/405965351aef8c76b8ef7ad370e5da58d57ef6068df197548b015464001a/greenlet-3.2.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:4d1378601b85e2e5171b99be8d2dc85f594c79967599328f95c1dc1a40f1c633", size = 640519, upload-time = "2025-08-07T13:53:13.928Z" }, + { url = "https://files.pythonhosted.org/packages/25/5d/382753b52006ce0218297ec1b628e048c4e64b155379331f25a7316eb749/greenlet-3.2.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0db5594dce18db94f7d1650d7489909b57afde4c580806b8d9203b6e79cdc079", size = 639707, 
upload-time = "2025-08-07T13:18:27.146Z" }, + { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, + { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, + { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" }, + { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" }, + { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, upload-time = "2025-08-07T13:44:12.287Z" }, +] + +[[package]] +name = "grpcio" +version = "1.76.0" 
+source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/e0/318c1ce3ae5a17894d5791e87aea147587c9e702f24122cc7a5c8bbaeeb1/grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73", size = 12785182, upload-time = "2025-10-21T16:23:12.106Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/00/8163a1beeb6971f66b4bbe6ac9457b97948beba8dd2fc8e1281dce7f79ec/grpcio-1.76.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:2e1743fbd7f5fa713a1b0a8ac8ebabf0ec980b5d8809ec358d488e273b9cf02a", size = 5843567, upload-time = "2025-10-21T16:20:52.829Z" }, + { url = "https://files.pythonhosted.org/packages/10/c1/934202f5cf335e6d852530ce14ddb0fef21be612ba9ecbbcbd4d748ca32d/grpcio-1.76.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:a8c2cf1209497cf659a667d7dea88985e834c24b7c3b605e6254cbb5076d985c", size = 11848017, upload-time = "2025-10-21T16:20:56.705Z" }, + { url = "https://files.pythonhosted.org/packages/11/0b/8dec16b1863d74af6eb3543928600ec2195af49ca58b16334972f6775663/grpcio-1.76.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08caea849a9d3c71a542827d6df9d5a69067b0a1efbea8a855633ff5d9571465", size = 6412027, upload-time = "2025-10-21T16:20:59.3Z" }, + { url = "https://files.pythonhosted.org/packages/d7/64/7b9e6e7ab910bea9d46f2c090380bab274a0b91fb0a2fe9b0cd399fffa12/grpcio-1.76.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f0e34c2079d47ae9f6188211db9e777c619a21d4faba6977774e8fa43b085e48", size = 7075913, upload-time = "2025-10-21T16:21:01.645Z" }, + { url = "https://files.pythonhosted.org/packages/68/86/093c46e9546073cefa789bd76d44c5cb2abc824ca62af0c18be590ff13ba/grpcio-1.76.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8843114c0cfce61b40ad48df65abcfc00d4dba82eae8718fab5352390848c5da", size = 6615417, upload-time = 
"2025-10-21T16:21:03.844Z" }, + { url = "https://files.pythonhosted.org/packages/f7/b6/5709a3a68500a9c03da6fb71740dcdd5ef245e39266461a03f31a57036d8/grpcio-1.76.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8eddfb4d203a237da6f3cc8a540dad0517d274b5a1e9e636fd8d2c79b5c1d397", size = 7199683, upload-time = "2025-10-21T16:21:06.195Z" }, + { url = "https://files.pythonhosted.org/packages/91/d3/4b1f2bf16ed52ce0b508161df3a2d186e4935379a159a834cb4a7d687429/grpcio-1.76.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:32483fe2aab2c3794101c2a159070584e5db11d0aa091b2c0ea9c4fc43d0d749", size = 8163109, upload-time = "2025-10-21T16:21:08.498Z" }, + { url = "https://files.pythonhosted.org/packages/5c/61/d9043f95f5f4cf085ac5dd6137b469d41befb04bd80280952ffa2a4c3f12/grpcio-1.76.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dcfe41187da8992c5f40aa8c5ec086fa3672834d2be57a32384c08d5a05b4c00", size = 7626676, upload-time = "2025-10-21T16:21:10.693Z" }, + { url = "https://files.pythonhosted.org/packages/36/95/fd9a5152ca02d8881e4dd419cdd790e11805979f499a2e5b96488b85cf27/grpcio-1.76.0-cp311-cp311-win32.whl", hash = "sha256:2107b0c024d1b35f4083f11245c0e23846ae64d02f40b2b226684840260ed054", size = 3997688, upload-time = "2025-10-21T16:21:12.746Z" }, + { url = "https://files.pythonhosted.org/packages/60/9c/5c359c8d4c9176cfa3c61ecd4efe5affe1f38d9bae81e81ac7186b4c9cc8/grpcio-1.76.0-cp311-cp311-win_amd64.whl", hash = "sha256:522175aba7af9113c48ec10cc471b9b9bd4f6ceb36aeb4544a8e2c80ed9d252d", size = 4709315, upload-time = "2025-10-21T16:21:15.26Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "h2" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, +] + +[[package]] +name = "h5py" +version = "3.15.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/6a/0d79de0b025aa85dc8864de8e97659c94cf3d23148394a954dc5ca52f8c8/h5py-3.15.1.tar.gz", hash = "sha256:c86e3ed45c4473564de55aa83b6fc9e5ead86578773dfbd93047380042e26b69", size = 426236, upload-time = "2025-10-16T10:35:27.404Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/fd/8349b48b15b47768042cff06ad6e1c229f0a4bd89225bf6b6894fea27e6d/h5py-3.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5aaa330bcbf2830150c50897ea5dcbed30b5b6d56897289846ac5b9e529ec243", size = 3434135, upload-time = "2025-10-16T10:33:47.954Z" }, + { url = "https://files.pythonhosted.org/packages/c1/b0/1c628e26a0b95858f54aba17e1599e7f6cd241727596cc2580b72cb0a9bf/h5py-3.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:c970fb80001fffabb0109eaf95116c8e7c0d3ca2de854e0901e8a04c1f098509", size = 2870958, upload-time = "2025-10-16T10:33:50.907Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e3/c255cafc9b85e6ea04e2ad1bba1416baa1d7f57fc98a214be1144087690c/h5py-3.15.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:80e5bb5b9508d5d9da09f81fd00abbb3f85da8143e56b1585d59bc8ceb1dba8b", size = 4504770, upload-time = "2025-10-16T10:33:54.357Z" }, + { url = "https://files.pythonhosted.org/packages/8b/23/4ab1108e87851ccc69694b03b817d92e142966a6c4abd99e17db77f2c066/h5py-3.15.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5b849ba619a066196169763c33f9f0f02e381156d61c03e000bb0100f9950faf", size = 4700329, upload-time = "2025-10-16T10:33:57.616Z" }, + { url = "https://files.pythonhosted.org/packages/a4/e4/932a3a8516e4e475b90969bf250b1924dbe3612a02b897e426613aed68f4/h5py-3.15.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e7f6c841efd4e6e5b7e82222eaf90819927b6d256ab0f3aca29675601f654f3c", size = 4152456, upload-time = "2025-10-16T10:34:00.843Z" }, + { url = "https://files.pythonhosted.org/packages/2a/0a/f74d589883b13737021b2049ac796328f188dbb60c2ed35b101f5b95a3fc/h5py-3.15.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ca8a3a22458956ee7b40d8e39c9a9dc01f82933e4c030c964f8b875592f4d831", size = 4617295, upload-time = "2025-10-16T10:34:04.154Z" }, + { url = "https://files.pythonhosted.org/packages/23/95/499b4e56452ef8b6c95a271af0dde08dac4ddb70515a75f346d4f400579b/h5py-3.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:550e51131376889656feec4aff2170efc054a7fe79eb1da3bb92e1625d1ac878", size = 2882129, upload-time = "2025-10-16T10:34:06.886Z" }, + { url = "https://files.pythonhosted.org/packages/ce/bb/cfcc70b8a42222ba3ad4478bcef1791181ea908e2adbd7d53c66395edad5/h5py-3.15.1-cp311-cp311-win_arm64.whl", hash = "sha256:b39239947cb36a819147fc19e86b618dcb0953d1cd969f5ed71fc0de60392427", size = 2477121, upload-time = 
"2025-10-16T10:34:09.579Z" }, +] + +[[package]] +name = "hdbscan" +version = "0.8.40" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scikit-learn" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c1/84/6b010387b795f774e1ec695df3c8660c15abd041783647d5e7e4076bfc6b/hdbscan-0.8.40.tar.gz", hash = "sha256:c9e383ff17beee0591075ff65d524bda5b5a35dfb01d218245a7ba30c8d48a17", size = 6904096, upload-time = "2024-11-18T16:14:05.384Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/6b/88b8c8023c0c0b27589ad83c82084a1b751917a3e09bdf7fcacf7e6bd523/hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5e958f0d7a33cd2b5e8e927b47f7360bf8a3e7d72355dd65a701e8aabe407b27", size = 1491349, upload-time = "2024-11-18T16:16:10.666Z" }, + { url = "https://files.pythonhosted.org/packages/e5/72/84bc7b6ea9eb59ca6c5e4d3f32313cdfa8f4ab5cfece6fb6dfef4c9149fc/hdbscan-0.8.40-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b95447c9c2cf6c95f98210c0edee3dc463d0a237e5531076855d9776495c96fc", size = 4459927, upload-time = "2025-10-11T11:55:49.958Z" }, + { url = "https://files.pythonhosted.org/packages/a3/ef/32c8a0b3dc6e6c4e433b85b30c3723d8eb48d115c0185b82ab89e1a0ef89/hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e0d6197ee045b173e1f16e6884386f335a56091e373a839dd24f7331a8fa9ed", size = 4576215, upload-time = "2024-11-18T16:14:11.241Z" }, + { url = "https://files.pythonhosted.org/packages/64/b1/96c347c7740efa1ac803be64155159284f92fafcff88c1077344e64eead5/hdbscan-0.8.40-cp311-cp311-win_amd64.whl", hash = "sha256:127cbe8c858dc77adfde33a3e1ce4f3bea810f78b01d2bd47b1147d4b5a50472", size = 732173, upload-time = "2024-11-18T16:18:40.361Z" }, +] + +[[package]] +name = "hf-xet" +version = "1.1.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/7a/49/91010b59debc7c862a5fd426d343134dd9a68778dbe570234b6495a4e204/hf_xet-1.1.8.tar.gz", hash = "sha256:62a0043e441753bbc446dcb5a3fe40a4d03f5fb9f13589ef1df9ab19252beb53", size = 484065, upload-time = "2025-08-18T22:01:03.584Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/91/5814db3a0d4a65fb6a87f0931ae28073b87f06307701fe66e7c41513bfb4/hf_xet-1.1.8-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3d5f82e533fc51c7daad0f9b655d9c7811b5308e5890236828bd1dd3ed8fea74", size = 2752357, upload-time = "2025-08-18T22:00:58.777Z" }, + { url = "https://files.pythonhosted.org/packages/70/72/ce898516e97341a7a9d450609e130e108643389110261eaee6deb1ba8545/hf_xet-1.1.8-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:8e2dba5896bca3ab61d0bef4f01a1647004de59640701b37e37eaa57087bbd9d", size = 2613142, upload-time = "2025-08-18T22:00:57.252Z" }, + { url = "https://files.pythonhosted.org/packages/b7/d6/13af5f916cef795ac2b5e4cc1de31f2e0e375f4475d50799915835f301c2/hf_xet-1.1.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfe5700bc729be3d33d4e9a9b5cc17a951bf8c7ada7ba0c9198a6ab2053b7453", size = 3175859, upload-time = "2025-08-18T22:00:55.978Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ed/34a193c9d1d72b7c3901b3b5153b1be9b2736b832692e1c3f167af537102/hf_xet-1.1.8-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:09e86514c3c4284ed8a57d6b0f3d089f9836a0af0a1ceb3c9dd664f1f3eaefef", size = 3074178, upload-time = "2025-08-18T22:00:54.147Z" }, + { url = "https://files.pythonhosted.org/packages/4a/1b/de6817b4bf65385280252dff5c9cceeedfbcb27ddb93923639323c1034a4/hf_xet-1.1.8-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4a9b99ab721d385b83f4fc8ee4e0366b0b59dce03b5888a86029cc0ca634efbf", size = 3238122, upload-time = "2025-08-18T22:01:00.546Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/13/874c85c7ed519ec101deb654f06703d9e5e68d34416730f64c4755ada36a/hf_xet-1.1.8-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:25b9d43333bbef39aeae1616789ec329c21401a7fe30969d538791076227b591", size = 3344325, upload-time = "2025-08-18T22:01:02.013Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/0aaf279f4f3dea58e99401b92c31c0f752924ba0e6c7d7bb07b1dbd7f35e/hf_xet-1.1.8-cp37-abi3-win_amd64.whl", hash = "sha256:4171f31d87b13da4af1ed86c98cf763292e4720c088b4957cf9d564f92904ca9", size = 2801689, upload-time = "2025-08-18T22:01:04.81Z" }, +] + +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 
78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httplib2" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyparsing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/77/6653db69c1f7ecfe5e3f9726fdadc981794656fcd7d98c4209fecfea9993/httplib2-0.31.0.tar.gz", hash = "sha256:ac7ab497c50975147d4f7b1ade44becc7df2f8954d42b38b3d69c515f531135c", size = 250759, upload-time = "2025-09-11T12:16:03.403Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/a2/0d269db0f6163be503775dc8b6a6fa15820cc9fdc866f6ba608d86b721f2/httplib2-0.31.0-py3-none-any.whl", hash = "sha256:b9cd78abea9b4e43a7714c6e0f8b6b8561a6fc1e95d5dbd367f5bf0ef35f5d24", size = 91148, upload-time = "2025-09-11T12:16:01.803Z" }, +] + +[[package]] +name = "httptools" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/46/120a669232c7bdedb9d52d4aeae7e6c7dfe151e99dc70802e2fc7a5e1993/httptools-0.7.1.tar.gz", hash = "sha256:abd72556974f8e7c74a259655924a717a2365b236c882c3f6f8a45fe94703ac9", size = 258961, upload-time = "2025-10-10T03:55:08.559Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/08/17e07e8d89ab8f343c134616d72eebfe03798835058e2ab579dcc8353c06/httptools-0.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:474d3b7ab469fefcca3697a10d11a32ee2b9573250206ba1e50d5980910da657", size = 206521, upload-time = "2025-10-10T03:54:31.002Z" }, + { url = "https://files.pythonhosted.org/packages/aa/06/c9c1b41ff52f16aee526fd10fbda99fa4787938aa776858ddc4a1ea825ec/httptools-0.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3c3b7366bb6c7b96bd72d0dbe7f7d5eead261361f013be5f6d9590465ea1c70", size = 110375, upload-time = "2025-10-10T03:54:31.941Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/cc/10935db22fda0ee34c76f047590ca0a8bd9de531406a3ccb10a90e12ea21/httptools-0.7.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:379b479408b8747f47f3b253326183d7c009a3936518cdb70db58cffd369d9df", size = 456621, upload-time = "2025-10-10T03:54:33.176Z" }, + { url = "https://files.pythonhosted.org/packages/0e/84/875382b10d271b0c11aa5d414b44f92f8dd53e9b658aec338a79164fa548/httptools-0.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cad6b591a682dcc6cf1397c3900527f9affef1e55a06c4547264796bbd17cf5e", size = 454954, upload-time = "2025-10-10T03:54:34.226Z" }, + { url = "https://files.pythonhosted.org/packages/30/e1/44f89b280f7e46c0b1b2ccee5737d46b3bb13136383958f20b580a821ca0/httptools-0.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eb844698d11433d2139bbeeb56499102143beb582bd6c194e3ba69c22f25c274", size = 440175, upload-time = "2025-10-10T03:54:35.942Z" }, + { url = "https://files.pythonhosted.org/packages/6f/7e/b9287763159e700e335028bc1824359dc736fa9b829dacedace91a39b37e/httptools-0.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f65744d7a8bdb4bda5e1fa23e4ba16832860606fcc09d674d56e425e991539ec", size = 440310, upload-time = "2025-10-10T03:54:37.1Z" }, + { url = "https://files.pythonhosted.org/packages/b3/07/5b614f592868e07f5c94b1f301b5e14a21df4e8076215a3bccb830a687d8/httptools-0.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:135fbe974b3718eada677229312e97f3b31f8a9c8ffa3ae6f565bf808d5b6bcb", size = 86875, upload-time = "2025-10-10T03:54:38.421Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", 
hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + +[package.optional-dependencies] +http2 = [ + { name = "h2" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.34.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/c9/bdbe19339f76d12985bc03572f330a01a93c04dffecaaea3061bdd7fb892/huggingface_hub-0.34.4.tar.gz", hash = "sha256:a4228daa6fb001be3f4f4bdaf9a0db00e1739235702848df00885c9b5742c85c", size = 459768, upload-time = "2025-08-08T09:14:52.365Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/7b/bb06b061991107cd8783f300adff3e7b7f284e330fd82f507f2a1417b11d/huggingface_hub-0.34.4-py3-none-any.whl", hash = "sha256:9b365d781739c93ff90c359844221beef048403f1bc1f1c123c191257c3c890a", size = 561452, upload-time = "2025-08-08T09:14:50.159Z" }, +] + +[[package]] +name = "humanfriendly" +version = "10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyreadline3", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = 
"sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" }, +] + +[[package]] +name = "hydra-core" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "omegaconf" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494, upload-time = "2023-02-23T18:33:43.03Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, +] + +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" }, +] + +[[package]] +name = 
"icecream" +version = "2.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "colorama" }, + { name = "executing" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d0/be/a89ec4132ddb4481f9587f736b8a01a07378f9e71de73549223ff1cd41f7/icecream-2.1.8.tar.gz", hash = "sha256:37269bbc62b02f0d85bfaf3a0eb4df272c967fad059f7ddcdaee5303ea2b2a62", size = 18337, upload-time = "2025-09-14T09:31:09.938Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/5f/f877d2cfcad41c0db98b120872f67b805fde5e6d407e584e9edfc9dec35c/icecream-2.1.8-py3-none-any.whl", hash = "sha256:10b1c39dcb54cb28eb487bac56c35dbf9c2b2f406d24340e1a615c3f17274852", size = 15714, upload-time = "2025-09-14T09:31:08.647Z" }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, +] + +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026, upload-time = "2022-07-01T12:21:05.687Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "ipykernel" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "appnope", marker = "sys_platform == 'darwin'" }, + { name = "comm" }, + { name = "debugpy" }, + { name 
= "ipython" }, + { name = "jupyter-client" }, + { name = "jupyter-core" }, + { name = "matplotlib-inline" }, + { name = "nest-asyncio" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyzmq" }, + { name = "tornado" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/a4/4948be6eb88628505b83a1f2f40d90254cab66abf2043b3c40fa07dfce0f/ipykernel-7.1.0.tar.gz", hash = "sha256:58a3fc88533d5930c3546dc7eac66c6d288acde4f801e2001e65edc5dc9cf0db", size = 174579, upload-time = "2025-10-27T09:46:39.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/17/20c2552266728ceba271967b87919664ecc0e33efca29c3efc6baf88c5f9/ipykernel-7.1.0-py3-none-any.whl", hash = "sha256:763b5ec6c5b7776f6a8d7ce09b267693b4e5ce75cb50ae696aaefb3c85e1ea4c", size = 117968, upload-time = "2025-10-27T09:46:37.805Z" }, +] + +[[package]] +name = "ipython" +version = "9.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/34/29b18c62e39ee2f7a6a3bba7efd952729d8aadd45ca17efc34453b717665/ipython-9.6.0.tar.gz", hash = "sha256:5603d6d5d356378be5043e69441a072b50a5b33b4503428c77b04cb8ce7bc731", size = 4396932, upload-time = "2025-09-29T10:55:53.948Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/c5/d5e07995077e48220269c28a221e168c91123ad5ceee44d548f54a057fc0/ipython-9.6.0-py3-none-any.whl", hash = "sha256:5f77efafc886d2f023442479b8149e7d86547ad0a979e9da9f045d252f648196", size = 616170, upload-time = "2025-09-29T10:55:47.676Z" }, +] 
+ +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + +[[package]] +name = "itsdangerous" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + +[[package]] +name = "jellyfish" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/14/fc5bdb637996df181e5c4fa3b15dcc27d33215e6c41753564ae453bdb40f/jellyfish-1.2.1.tar.gz", hash = "sha256:72d2fda61b23babe862018729be73c8b0dc12e3e6601f36f6e65d905e249f4db", size = 364417, upload-time = "2025-10-11T19:36:37.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/67/5d5ec4004d92573cbccd33fc84d0ad61e523b29f7b17b062913b183961e1/jellyfish-1.2.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9913789a98ccf49213fbb1dabc597847a0ec33d3b0e151689498f4b38ba9be0f", size = 325488, upload-time = "2025-10-11T19:35:06.87Z" }, + { url = "https://files.pythonhosted.org/packages/83/21/6cf3add349cd0002cc586178bd8f1fd006894e5c70f959a8db5507cfe075/jellyfish-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4e36d9000d4f7e1a35689a74ec7749d27a216dfa6c47cac2e5ad3de8a523bd69", size = 320226, upload-time = "2025-10-11T19:35:08.314Z" }, + { url = "https://files.pythonhosted.org/packages/1c/ed/b5458b09482913caece2e9f807599318e48490b01c3c3134b636ecd7af8c/jellyfish-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7853d2ed7d6929c029312ec849410f1ea7ae76ce72ad1140fb73f6e8a1e6aa4f", size = 353091, upload-time = "2025-10-11T19:35:09.395Z" }, + { url = "https://files.pythonhosted.org/packages/67/be/7e01fda506f3249d3548d35d1203e009a850734297ccfe4039ce76a927dc/jellyfish-1.2.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:68080af234256ef943f0add6fc79816b0c643d8df291c17a85c1b6e45bdfbb96", size = 362820, upload-time = "2025-10-11T19:35:11.28Z" }, + { url 
= "https://files.pythonhosted.org/packages/f5/95/39302d0df1e1b7c348c1fe6fda27cc6cd4c0bd0b27d79f34de3981a14e55/jellyfish-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c5acb213aa75a61bcfc176566e20f2503069667e760d83d403b59e115fef0dd", size = 360560, upload-time = "2025-10-11T19:35:12.266Z" }, + { url = "https://files.pythonhosted.org/packages/ac/9d/b477787bc032e8b5b1ffd798e1c638ecbd54621967dc5577ccd10b5e9444/jellyfish-1.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4b28fcefc0c3534277ff0306e6c10672fb050f4784b5f3be7037e80801569fb5", size = 533823, upload-time = "2025-10-11T19:35:13.268Z" }, + { url = "https://files.pythonhosted.org/packages/73/7e/c6e389c4fccfc2838b1d3fe21736b5bf9ea1e739287d128a291eb84df158/jellyfish-1.2.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f69aeb08659a6c81d559bbe319075e3417434ae5b3a5e4a758d1c4055a03497a", size = 554439, upload-time = "2025-10-11T19:35:14.595Z" }, + { url = "https://files.pythonhosted.org/packages/17/1e/3239b2dfdfb2f1d8795a8d35936c5eadb90475dbbeebacf45e083579d560/jellyfish-1.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:63770120cc3386dcc13bcc4df508ab281a6b14c3b2c0e33586439a6c40ee122f", size = 523762, upload-time = "2025-10-11T19:35:15.614Z" }, + { url = "https://files.pythonhosted.org/packages/2c/05/62f16bec1d2cd74e6944dfb18a8511bd9df9f2d58e041567f909da22ee26/jellyfish-1.2.1-cp311-cp311-win32.whl", hash = "sha256:ecf62d4aad0baa8832ab60f96e7baedbe6558bd292597503d927e9c5bce745d8", size = 208967, upload-time = "2025-10-11T19:35:16.616Z" }, + { url = "https://files.pythonhosted.org/packages/87/a4/69b65d9090d297407bc530f2e5b8707aa1caa9484e7281a04da6821f13be/jellyfish-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:bd186c041d9be86c4fa5e2490943ce5d7f05b472f45d7f49426f259f3dd20bc4", size = 213556, upload-time = "2025-10-11T19:35:17.528Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/72/e4897449abd844d501412873d1d15bd846bcc919648c0b1043e225268b21/jellyfish-1.2.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:748dc45a0394fbe9120b8b3b9a39fab0967c7e2d6ecdd5304af018e774f80f96", size = 326967, upload-time = "2025-10-11T19:36:18.851Z" }, + { url = "https://files.pythonhosted.org/packages/60/3f/c7a550abd212ae40c2a555055a3f16ba39376e486ba0189e150fb25cf6b1/jellyfish-1.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:13f1ac9caba22af10bfe42f674822643c0266009f882e0fe652079706dc5d13a", size = 321759, upload-time = "2025-10-11T19:36:19.882Z" }, + { url = "https://files.pythonhosted.org/packages/19/58/a268365ba659f04d4db0c94325042aa9aee69c3a9a5823a5b2a2db308a5c/jellyfish-1.2.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ffeeb6c78c45fbb6d2a22b0173fb8a6af849001d6c26fab49c525136dbd9734", size = 354847, upload-time = "2025-10-11T19:36:20.976Z" }, + { url = "https://files.pythonhosted.org/packages/5d/63/371351a5e0e19d642e33c1c8b4c3ef47538f36bbd8d76a06ee34000b38a2/jellyfish-1.2.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1354b558a0a16597b6032dd0af64bebd24994f7e7484cf14993320eb764b06cb", size = 364210, upload-time = "2025-10-11T19:36:22.071Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ed/f43d79b9b6d846189b1235f8303d1246ca9cba79a61a26cac790b57c1789/jellyfish-1.2.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5977810972c6f0b2e61252c4758fd5aee21abf663ff309881195a99d37daa94", size = 361876, upload-time = "2025-10-11T19:36:23.235Z" }, + { url = "https://files.pythonhosted.org/packages/74/5c/82455195b77cd1996c3618bd5aa8f25a9fc254d401a3c1425fb60cf97742/jellyfish-1.2.1-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:536c80d8d4ec7f39cbb10b85d926ff96cef3cde4a83ca0991c07cd9835d5dc13", size = 535488, upload-time = "2025-10-11T19:36:24.252Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/3d/295468c5df5a8d03f522b0c21fc3e694d6be376602a6d755bf7815947522/jellyfish-1.2.1-pp311-pypy311_pp73-musllinux_1_1_i686.whl", hash = "sha256:21baa92d4a5112167721156f6d061c2ae105f2995b3a5e19cec6662928f0c439", size = 555818, upload-time = "2025-10-11T19:36:25.667Z" }, + { url = "https://files.pythonhosted.org/packages/41/6d/4029265138a5a0b18e4df381560e467339bec477f4efaa7614736dbc433e/jellyfish-1.2.1-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:68ea3ddd4dae1152a7f7155ef02a7bfad919611158d71b301f9aa167685819af", size = 525259, upload-time = "2025-10-11T19:36:26.726Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "jiter" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" }, + { url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" }, + { url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" }, + { url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" }, + { url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" }, + { url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" }, + { url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" }, + { url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" }, + { url = "https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" }, + { url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" }, +] + +[[package]] +name = "joblib" +version = "1.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/5d/447af5ea094b9e4c4054f82e223ada074c552335b9b4b2d14bd9b35a67c4/joblib-1.5.2.tar.gz", hash = 
"sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55", size = 331077, upload-time = "2025-08-27T12:15:46.575Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" }, +] + +[[package]] +name = "jsonlines" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/87/bcda8e46c88d0e34cad2f09ee2d0c7f5957bccdb9791b0b934ec84d84be4/jsonlines-4.0.0.tar.gz", hash = "sha256:0c6d2c09117550c089995247f605ae4cf77dd1533041d366351f6f298822ea74", size = 11359, upload-time = "2023-09-01T12:34:44.187Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl", hash = "sha256:185b334ff2ca5a91362993f42e83588a360cf95ce4b71a73548502bda52a7c55", size = 8701, upload-time = "2023-09-01T12:34:42.563Z" }, +] + +[[package]] +name = "jupyter-client" +version = "8.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jupyter-core" }, + { name = "python-dateutil" }, + { name = "pyzmq" }, + { name = "tornado" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/22/bf9f12fdaeae18019a468b68952a60fe6dbab5d67cd2a103cac7659b41ca/jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419", size = 342019, upload-time = "2024-09-17T10:44:17.613Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f", size 
= 106105, upload-time = "2024-09-17T10:44:15.218Z" }, +] + +[[package]] +name = "jupyter-core" +version = "5.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "platformdirs" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/49/9d1284d0dc65e2c757b74c6687b6d319b02f822ad039e5c512df9194d9dd/jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508", size = 89814, upload-time = "2025-10-16T19:19:18.444Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/e7/80988e32bf6f73919a113473a604f5a8f09094de312b9d52b79c2df7612b/jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407", size = 29032, upload-time = "2025-10-16T19:19:16.783Z" }, +] + +[[package]] +name = "kiwisolver" +version = "1.4.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6f/ab/c80b0d5a9d8a1a65f4f815f2afff9798b12c3b9f31f1d304dd233dd920e2/kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16", size = 124167, upload-time = "2025-08-10T21:25:53.403Z" }, + { url = "https://files.pythonhosted.org/packages/a0/c0/27fe1a68a39cf62472a300e2879ffc13c0538546c359b86f149cc19f6ac3/kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089", size = 66579, upload-time = "2025-08-10T21:25:54.79Z" }, + { url = 
"https://files.pythonhosted.org/packages/31/a2/a12a503ac1fd4943c50f9822678e8015a790a13b5490354c68afb8489814/kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543", size = 65309, upload-time = "2025-08-10T21:25:55.76Z" }, + { url = "https://files.pythonhosted.org/packages/66/e1/e533435c0be77c3f64040d68d7a657771194a63c279f55573188161e81ca/kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61", size = 1435596, upload-time = "2025-08-10T21:25:56.861Z" }, + { url = "https://files.pythonhosted.org/packages/67/1e/51b73c7347f9aabdc7215aa79e8b15299097dc2f8e67dee2b095faca9cb0/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1", size = 1246548, upload-time = "2025-08-10T21:25:58.246Z" }, + { url = "https://files.pythonhosted.org/packages/21/aa/72a1c5d1e430294f2d32adb9542719cfb441b5da368d09d268c7757af46c/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872", size = 1263618, upload-time = "2025-08-10T21:25:59.857Z" }, + { url = "https://files.pythonhosted.org/packages/a3/af/db1509a9e79dbf4c260ce0cfa3903ea8945f6240e9e59d1e4deb731b1a40/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26", size = 1317437, upload-time = "2025-08-10T21:26:01.105Z" }, + { url = "https://files.pythonhosted.org/packages/e0/f2/3ea5ee5d52abacdd12013a94130436e19969fa183faa1e7c7fbc89e9a42f/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028", size = 2195742, upload-time = "2025-08-10T21:26:02.675Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/9b/1efdd3013c2d9a2566aa6a337e9923a00590c516add9a1e89a768a3eb2fc/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771", size = 2290810, upload-time = "2025-08-10T21:26:04.009Z" }, + { url = "https://files.pythonhosted.org/packages/fb/e5/cfdc36109ae4e67361f9bc5b41323648cb24a01b9ade18784657e022e65f/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a", size = 2461579, upload-time = "2025-08-10T21:26:05.317Z" }, + { url = "https://files.pythonhosted.org/packages/62/86/b589e5e86c7610842213994cdea5add00960076bef4ae290c5fa68589cac/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464", size = 2268071, upload-time = "2025-08-10T21:26:06.686Z" }, + { url = "https://files.pythonhosted.org/packages/3b/c6/f8df8509fd1eee6c622febe54384a96cfaf4d43bf2ccec7a0cc17e4715c9/kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2", size = 73840, upload-time = "2025-08-10T21:26:07.94Z" }, + { url = "https://files.pythonhosted.org/packages/e2/2d/16e0581daafd147bc11ac53f032a2b45eabac897f42a338d0a13c1e5c436/kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7", size = 65159, upload-time = "2025-08-10T21:26:09.048Z" }, + { url = "https://files.pythonhosted.org/packages/a3/0f/36d89194b5a32c054ce93e586d4049b6c2c22887b0eb229c61c68afd3078/kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5", size = 60104, upload-time = "2025-08-10T21:27:43.287Z" }, + { url = 
"https://files.pythonhosted.org/packages/52/ba/4ed75f59e4658fd21fe7dde1fee0ac397c678ec3befba3fe6482d987af87/kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa", size = 58592, upload-time = "2025-08-10T21:27:44.314Z" }, + { url = "https://files.pythonhosted.org/packages/33/01/a8ea7c5ea32a9b45ceeaee051a04c8ed4320f5add3c51bfa20879b765b70/kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2", size = 80281, upload-time = "2025-08-10T21:27:45.369Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/dbd2ecdce306f1d07a1aaf324817ee993aab7aee9db47ceac757deabafbe/kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f", size = 78009, upload-time = "2025-08-10T21:27:46.376Z" }, + { url = "https://files.pythonhosted.org/packages/da/e9/0d4add7873a73e462aeb45c036a2dead2562b825aa46ba326727b3f31016/kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1", size = 73929, upload-time = "2025-08-10T21:27:48.236Z" }, +] + +[[package]] +name = "lazy-loader" +version = "0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6b/c875b30a1ba490860c93da4cabf479e03f584eba06fe5963f6f6644653d8/lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1", size = 15431, upload-time = "2024-04-05T13:03:12.261Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/60/d497a310bde3f01cb805196ac61b7ad6dc5dcf8dce66634dc34364b20b4f/lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc", 
size = 12097, upload-time = "2024-04-05T13:03:10.514Z" }, +] + +[[package]] +name = "levenshtein" +version = "0.27.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rapidfuzz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/b3/b5f8011483ba9083a0bc74c4d58705e9cf465fbe55c948a1b1357d0a2aa8/levenshtein-0.27.1.tar.gz", hash = "sha256:3e18b73564cfc846eec94dd13fab6cb006b5d2e0cc56bad1fd7d5585881302e3", size = 382571, upload-time = "2025-03-02T19:44:56.148Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/84/110136e740655779aceb0da2399977362f21b2dbf3ea3646557f9c2237c4/levenshtein-0.27.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2e6f1760108319a108dceb2f02bc7cdb78807ad1f9c673c95eaa1d0fe5dfcaae", size = 174555, upload-time = "2025-03-02T19:42:51.781Z" }, + { url = "https://files.pythonhosted.org/packages/19/5b/176d96959f5c5969f356d8856f8e20d2e72f7e4879f6d1cda8e5c2ac2614/levenshtein-0.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c4ed8400d94ab348099395e050b8ed9dd6a5d6b5b9e75e78b2b3d0b5f5b10f38", size = 156286, upload-time = "2025-03-02T19:42:53.106Z" }, + { url = "https://files.pythonhosted.org/packages/2a/2d/a75abaafc8a46b0dc52ab14dc96708989a31799a02a4914f9210c3415f04/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7826efe51be8ff58bc44a633e022fdd4b9fc07396375a6dbc4945a3bffc7bf8f", size = 152413, upload-time = "2025-03-02T19:42:55.129Z" }, + { url = "https://files.pythonhosted.org/packages/9a/5f/533f4adf964b10817a1d0ecca978b3542b3b9915c96172d20162afe18bed/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff5afb78719659d353055863c7cb31599fbea6865c0890b2d840ee40214b3ddb", size = 184236, upload-time = "2025-03-02T19:42:56.427Z" }, + { url = 
"https://files.pythonhosted.org/packages/02/79/e698623795e36e0d166a3aa1eac6fe1e446cac3a5c456664a95c351571d1/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:201dafd5c004cd52018560cf3213da799534d130cf0e4db839b51f3f06771de0", size = 185502, upload-time = "2025-03-02T19:42:57.596Z" }, + { url = "https://files.pythonhosted.org/packages/ac/94/76b64762f4af6e20bbab79713c4c48783240e6e502b2f52e5037ddda688a/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5ddd59f3cfaec216811ee67544779d9e2d6ed33f79337492a248245d6379e3d", size = 161749, upload-time = "2025-03-02T19:42:59.222Z" }, + { url = "https://files.pythonhosted.org/packages/56/d0/d10eff9224c94a478078a469aaeb43471fdeddad035f443091224c7544b8/levenshtein-0.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6afc241d27ecf5b921063b796812c55b0115423ca6fa4827aa4b1581643d0a65", size = 246686, upload-time = "2025-03-02T19:43:00.454Z" }, + { url = "https://files.pythonhosted.org/packages/b2/8a/ebbeff74461da3230d00e8a8197480a2ea1a9bbb7dbc273214d7ea3896cb/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee2e766277cceb8ca9e584ea03b8dc064449ba588d3e24c1923e4b07576db574", size = 1116616, upload-time = "2025-03-02T19:43:02.431Z" }, + { url = "https://files.pythonhosted.org/packages/1d/9b/e7323684f833ede13113fba818c3afe665a78b47d720afdeb2e530c1ecb3/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:920b23d6109453913ce78ec451bc402ff19d020ee8be4722e9d11192ec2fac6f", size = 1401483, upload-time = "2025-03-02T19:43:04.62Z" }, + { url = "https://files.pythonhosted.org/packages/ef/1d/9b6ab30ff086a33492d6f7de86a07050b15862ccf0d9feeccfbe26af52d8/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:560d7edba126e2eea3ac3f2f12e7bd8bc9c6904089d12b5b23b6dfa98810b209", size = 1225805, upload-time = "2025-03-02T19:43:06.734Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/07/ae2f31e87ff65ba4857e25192646f1f3c8cca83c2ac1c27e551215b7e1b6/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:8d5362b6c7aa4896dc0cb1e7470a4ad3c06124e0af055dda30d81d3c5549346b", size = 1419860, upload-time = "2025-03-02T19:43:08.084Z" }, + { url = "https://files.pythonhosted.org/packages/43/d2/dfcc5c22c07bab9be99f3f47a907be583bcd37bfd2eec57a205e59671019/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:65ba880815b0f80a80a293aeebac0fab8069d03ad2d6f967a886063458f9d7a1", size = 1188823, upload-time = "2025-03-02T19:43:09.592Z" }, + { url = "https://files.pythonhosted.org/packages/8b/96/713335623f8ab50eba0627c8685618dc3a985aedaaea9f492986b9443551/levenshtein-0.27.1-cp311-cp311-win32.whl", hash = "sha256:fcc08effe77fec0bc5b0f6f10ff20b9802b961c4a69047b5499f383119ddbe24", size = 88156, upload-time = "2025-03-02T19:43:11.442Z" }, + { url = "https://files.pythonhosted.org/packages/aa/ae/444d6e8ba9a35379a56926716f18bb2e77c6cf69e5324521fbe6885f14f6/levenshtein-0.27.1-cp311-cp311-win_amd64.whl", hash = "sha256:0ed402d8902be7df212ac598fc189f9b2d520817fdbc6a05e2ce44f7f3ef6857", size = 100399, upload-time = "2025-03-02T19:43:13.066Z" }, + { url = "https://files.pythonhosted.org/packages/80/c0/ff226897a238a2deb2ca2c00d658755a1aa01884b0ddc8f5d406cb5f2b0d/levenshtein-0.27.1-cp311-cp311-win_arm64.whl", hash = "sha256:7fdaab29af81a8eb981043737f42450efca64b9761ca29385487b29c506da5b5", size = 88033, upload-time = "2025-03-02T19:43:14.211Z" }, + { url = "https://files.pythonhosted.org/packages/7d/44/c5955d0b6830925559b00617d80c9f6e03a9b00c451835ee4da7010e71cd/levenshtein-0.27.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:909b7b6bce27a4ec90576c9a9bd9af5a41308dfecf364b410e80b58038277bbe", size = 170533, upload-time = "2025-03-02T19:44:38.096Z" }, + { url = 
"https://files.pythonhosted.org/packages/e7/3f/858572d68b33e13a9c154b99f153317efe68381bf63cc4e986e820935fc3/levenshtein-0.27.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d193a7f97b8c6a350e36ec58e41a627c06fa4157c3ce4b2b11d90cfc3c2ebb8f", size = 153119, upload-time = "2025-03-02T19:44:39.388Z" }, + { url = "https://files.pythonhosted.org/packages/d1/60/2bd8d001ea4eb53ca16faa7a649d56005ba22b1bcc2a4f1617ab27ed7e48/levenshtein-0.27.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:614be316e3c06118705fae1f717f9072d35108e5fd4e66a7dd0e80356135340b", size = 149576, upload-time = "2025-03-02T19:44:40.617Z" }, + { url = "https://files.pythonhosted.org/packages/e4/db/0580797e1e4ac26cf67761a235b29b49f62d2b175dbbc609882f2aecd4e4/levenshtein-0.27.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31fc0a5bb070722bdabb6f7e14955a294a4a968c68202d294699817f21545d22", size = 157445, upload-time = "2025-03-02T19:44:41.901Z" }, + { url = "https://files.pythonhosted.org/packages/f4/de/9c171c96d1f15c900086d7212b5543a85539e767689fc4933d14048ba1ec/levenshtein-0.27.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9415aa5257227af543be65768a80c7a75e266c3c818468ce6914812f88f9c3df", size = 243141, upload-time = "2025-03-02T19:44:43.228Z" }, + { url = "https://files.pythonhosted.org/packages/dc/1e/408fd10217eac0e43aea0604be22b4851a09e03d761d44d4ea12089dd70e/levenshtein-0.27.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:7987ef006a3cf56a4532bd4c90c2d3b7b4ca9ad3bf8ae1ee5713c4a3bdfda913", size = 98045, upload-time = "2025-03-02T19:44:44.527Z" }, +] + +[[package]] +name = "llvmlite" +version = "0.45.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/99/8d/5baf1cef7f9c084fb35a8afbde88074f0d6a727bc63ef764fe0e7543ba40/llvmlite-0.45.1.tar.gz", hash = 
"sha256:09430bb9d0bb58fc45a45a57c7eae912850bedc095cd0810a57de109c69e1c32", size = 185600, upload-time = "2025-10-01T17:59:52.046Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/ad/9bdc87b2eb34642c1cfe6bcb4f5db64c21f91f26b010f263e7467e7536a3/llvmlite-0.45.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:60f92868d5d3af30b4239b50e1717cb4e4e54f6ac1c361a27903b318d0f07f42", size = 43043526, upload-time = "2025-10-01T18:03:15.051Z" }, + { url = "https://files.pythonhosted.org/packages/a5/ea/c25c6382f452a943b4082da5e8c1665ce29a62884e2ec80608533e8e82d5/llvmlite-0.45.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98baab513e19beb210f1ef39066288784839a44cd504e24fff5d17f1b3cf0860", size = 37253118, upload-time = "2025-10-01T18:04:06.783Z" }, + { url = "https://files.pythonhosted.org/packages/fe/af/85fc237de98b181dbbe8647324331238d6c52a3554327ccdc83ced28efba/llvmlite-0.45.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3adc2355694d6a6fbcc024d59bb756677e7de506037c878022d7b877e7613a36", size = 56288209, upload-time = "2025-10-01T18:01:00.168Z" }, + { url = "https://files.pythonhosted.org/packages/0a/df/3daf95302ff49beff4230065e3178cd40e71294968e8d55baf4a9e560814/llvmlite-0.45.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2f3377a6db40f563058c9515dedcc8a3e562d8693a106a28f2ddccf2c8fcf6ca", size = 55140958, upload-time = "2025-10-01T18:02:11.199Z" }, + { url = "https://files.pythonhosted.org/packages/a4/56/4c0d503fe03bac820ecdeb14590cf9a248e120f483bcd5c009f2534f23f0/llvmlite-0.45.1-cp311-cp311-win_amd64.whl", hash = "sha256:f9c272682d91e0d57f2a76c6d9ebdfccc603a01828cdbe3d15273bdca0c3363a", size = 38132232, upload-time = "2025-10-01T18:04:52.181Z" }, +] + +[[package]] +name = "lxml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/aa/88/262177de60548e5a2bfc46ad28232c9e9cbde697bd94132aeb80364675cb/lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62", size = 4073426, upload-time = "2025-09-22T04:04:59.287Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/d5/becbe1e2569b474a23f0c672ead8a29ac50b2dc1d5b9de184831bda8d14c/lxml-6.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607", size = 8634365, upload-time = "2025-09-22T04:00:45.672Z" }, + { url = "https://files.pythonhosted.org/packages/28/66/1ced58f12e804644426b85d0bb8a4478ca77bc1761455da310505f1a3526/lxml-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938", size = 4650793, upload-time = "2025-09-22T04:00:47.783Z" }, + { url = "https://files.pythonhosted.org/packages/11/84/549098ffea39dfd167e3f174b4ce983d0eed61f9d8d25b7bf2a57c3247fc/lxml-6.0.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d", size = 4944362, upload-time = "2025-09-22T04:00:49.845Z" }, + { url = "https://files.pythonhosted.org/packages/ac/bd/f207f16abf9749d2037453d56b643a7471d8fde855a231a12d1e095c4f01/lxml-6.0.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438", size = 5083152, upload-time = "2025-09-22T04:00:51.709Z" }, + { url = "https://files.pythonhosted.org/packages/15/ae/bd813e87d8941d52ad5b65071b1affb48da01c4ed3c9c99e40abb266fbff/lxml-6.0.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964", size = 5023539, upload-time = "2025-09-22T04:00:53.593Z" }, + { url = 
"https://files.pythonhosted.org/packages/02/cd/9bfef16bd1d874fbe0cb51afb00329540f30a3283beb9f0780adbb7eec03/lxml-6.0.2-cp311-cp311-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d", size = 5344853, upload-time = "2025-09-22T04:00:55.524Z" }, + { url = "https://files.pythonhosted.org/packages/b8/89/ea8f91594bc5dbb879734d35a6f2b0ad50605d7fb419de2b63d4211765cc/lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7", size = 5225133, upload-time = "2025-09-22T04:00:57.269Z" }, + { url = "https://files.pythonhosted.org/packages/b9/37/9c735274f5dbec726b2db99b98a43950395ba3d4a1043083dba2ad814170/lxml-6.0.2-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178", size = 4677944, upload-time = "2025-09-22T04:00:59.052Z" }, + { url = "https://files.pythonhosted.org/packages/20/28/7dfe1ba3475d8bfca3878365075abe002e05d40dfaaeb7ec01b4c587d533/lxml-6.0.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553", size = 5284535, upload-time = "2025-09-22T04:01:01.335Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5f14bc0de763498fc29510e3532bf2b4b3a1c1d5d0dff2e900c16ba021ef/lxml-6.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb", size = 5067343, upload-time = "2025-09-22T04:01:03.13Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b0/bb8275ab5472f32b28cfbbcc6db7c9d092482d3439ca279d8d6fa02f7025/lxml-6.0.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a", size = 4725419, upload-time = "2025-09-22T04:01:05.013Z" }, + { url = 
"https://files.pythonhosted.org/packages/25/4c/7c222753bc72edca3b99dbadba1b064209bc8ed4ad448af990e60dcce462/lxml-6.0.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c", size = 5275008, upload-time = "2025-09-22T04:01:07.327Z" }, + { url = "https://files.pythonhosted.org/packages/6c/8c/478a0dc6b6ed661451379447cdbec77c05741a75736d97e5b2b729687828/lxml-6.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7", size = 5248906, upload-time = "2025-09-22T04:01:09.452Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d9/5be3a6ab2784cdf9accb0703b65e1b64fcdd9311c9f007630c7db0cfcce1/lxml-6.0.2-cp311-cp311-win32.whl", hash = "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46", size = 3610357, upload-time = "2025-09-22T04:01:11.102Z" }, + { url = "https://files.pythonhosted.org/packages/e2/7d/ca6fb13349b473d5732fb0ee3eec8f6c80fc0688e76b7d79c1008481bf1f/lxml-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078", size = 4036583, upload-time = "2025-09-22T04:01:12.766Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a2/51363b5ecd3eab46563645f3a2c3836a2fc67d01a1b87c5017040f39f567/lxml-6.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285", size = 3680591, upload-time = "2025-09-22T04:01:14.874Z" }, + { url = "https://files.pythonhosted.org/packages/0b/11/29d08bc103a62c0eba8016e7ed5aeebbf1e4312e83b0b1648dd203b0e87d/lxml-6.0.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700", size = 3949829, upload-time = "2025-09-22T04:04:45.608Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/b3/52ab9a3b31e5ab8238da241baa19eec44d2ab426532441ee607165aebb52/lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee", size = 4226277, upload-time = "2025-09-22T04:04:47.754Z" }, + { url = "https://files.pythonhosted.org/packages/a0/33/1eaf780c1baad88224611df13b1c2a9dfa460b526cacfe769103ff50d845/lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f", size = 4330433, upload-time = "2025-09-22T04:04:49.907Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c1/27428a2ff348e994ab4f8777d3a0ad510b6b92d37718e5887d2da99952a2/lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9", size = 4272119, upload-time = "2025-09-22T04:04:51.801Z" }, + { url = "https://files.pythonhosted.org/packages/f0/d0/3020fa12bcec4ab62f97aab026d57c2f0cfd480a558758d9ca233bb6a79d/lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a", size = 4417314, upload-time = "2025-09-22T04:04:55.024Z" }, + { url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, +] + +[[package]] +name = "markdown" +version = "3.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8d/37/02347f6d6d8279247a5837082ebc26fc0d5aaeaf75aa013fcbb433c777ab/markdown-3.9.tar.gz", hash = "sha256:d2900fe1782bd33bdbbd56859defef70c2e78fc46668f8eb9df3128138f2cb6a", size = 
364585, upload-time = "2025-09-04T20:25:22.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/70/ae/44c4a6a4cbb496d93c6257954260fe3a6e91b7bed2240e5dad2a717f5111/markdown-3.9-py3-none-any.whl", hash = "sha256:9f4d91ed810864ea88a6f32c07ba8bee1346c0cc1f6b1f9f6c822f2a9667d280", size = 107441, upload-time = "2025-09-04T20:25:21.784Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537, upload-time = "2024-10-18T15:21:54.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/28/bbf83e3f76936960b850435576dd5e67034e200469571be53f69174a2dfd/MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d", size = 14353, upload-time = "2024-10-18T15:21:02.187Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/30/316d194b093cde57d448a4c3209f22e3046c5bb2fb0820b118292b334be7/MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93", size = 12392, upload-time = "2024-10-18T15:21:02.941Z" }, + { url = "https://files.pythonhosted.org/packages/f2/96/9cdafba8445d3a53cae530aaf83c38ec64c4d5427d975c974084af5bc5d2/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832", size = 23984, upload-time = "2024-10-18T15:21:03.953Z" }, + { url = "https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84", size = 23120, upload-time = "2024-10-18T15:21:06.495Z" }, + { url = "https://files.pythonhosted.org/packages/8d/21/5e4851379f88f3fad1de30361db501300d4f07bcad047d3cb0449fc51f8c/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca", size = 23032, upload-time = "2024-10-18T15:21:07.295Z" }, + { url = "https://files.pythonhosted.org/packages/00/7b/e92c64e079b2d0d7ddf69899c98842f3f9a60a1ae72657c89ce2655c999d/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798", size = 24057, upload-time = "2024-10-18T15:21:08.073Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ac/46f960ca323037caa0a10662ef97d0a4728e890334fc156b9f9e52bcc4ca/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e", size = 23359, upload-time = "2024-10-18T15:21:09.318Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/84/83439e16197337b8b14b6a5b9c2105fff81d42c2a7c5b58ac7b62ee2c3b1/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", size = 23306, upload-time = "2024-10-18T15:21:10.185Z" }, + { url = "https://files.pythonhosted.org/packages/9a/34/a15aa69f01e2181ed8d2b685c0d2f6655d5cca2c4db0ddea775e631918cd/MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d", size = 15094, upload-time = "2024-10-18T15:21:11.005Z" }, + { url = "https://files.pythonhosted.org/packages/da/b8/3a3bd761922d416f3dc5d00bfbed11f66b1ab89a0c2b6e887240a30b0f6b/MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b", size = 15521, upload-time = "2024-10-18T15:21:12.911Z" }, +] + +[[package]] +name = "matplotlib" +version = "3.10.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "contourpy" }, + { name = "cycler" }, + { name = "fonttools" }, + { name = "kiwisolver" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyparsing" }, + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/e2/d2d5295be2f44c678ebaf3544ba32d20c1f9ef08c49fe47f496180e1db15/matplotlib-3.10.7.tar.gz", hash = "sha256:a06ba7e2a2ef9131c79c49e63dad355d2d878413a0376c1727c8b9335ff731c7", size = 34804865, upload-time = "2025-10-09T00:28:00.669Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/bc/0fb489005669127ec13f51be0c6adc074d7cf191075dab1da9fe3b7a3cfc/matplotlib-3.10.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:53b492410a6cd66c7a471de6c924f6ede976e963c0f3097a3b7abfadddc67d0a", size = 8257507, upload-time = "2025-10-09T00:26:19.073Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/6a/d42588ad895279ff6708924645b5d2ed54a7fb2dc045c8a804e955aeace1/matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9749313deb729f08207718d29c86246beb2ea3fdba753595b55901dee5d2fd6", size = 8119565, upload-time = "2025-10-09T00:26:21.023Z" }, + { url = "https://files.pythonhosted.org/packages/10/b7/4aa196155b4d846bd749cf82aa5a4c300cf55a8b5e0dfa5b722a63c0f8a0/matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2222c7ba2cbde7fe63032769f6eb7e83ab3227f47d997a8453377709b7fe3a5a", size = 8692668, upload-time = "2025-10-09T00:26:22.967Z" }, + { url = "https://files.pythonhosted.org/packages/e6/e7/664d2b97016f46683a02d854d730cfcf54ff92c1dafa424beebef50f831d/matplotlib-3.10.7-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e91f61a064c92c307c5a9dc8c05dc9f8a68f0a3be199d9a002a0622e13f874a1", size = 9521051, upload-time = "2025-10-09T00:26:25.041Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a3/37aef1404efa615f49b5758a5e0261c16dd88f389bc1861e722620e4a754/matplotlib-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f1851eab59ca082c95df5a500106bad73672645625e04538b3ad0f69471ffcc", size = 9576878, upload-time = "2025-10-09T00:26:27.478Z" }, + { url = "https://files.pythonhosted.org/packages/33/cd/b145f9797126f3f809d177ca378de57c45413c5099c5990de2658760594a/matplotlib-3.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:6516ce375109c60ceec579e699524e9d504cd7578506f01150f7a6bc174a775e", size = 8115142, upload-time = "2025-10-09T00:26:29.774Z" }, + { url = "https://files.pythonhosted.org/packages/2e/39/63bca9d2b78455ed497fcf51a9c71df200a11048f48249038f06447fa947/matplotlib-3.10.7-cp311-cp311-win_arm64.whl", hash = "sha256:b172db79759f5f9bc13ef1c3ef8b9ee7b37b0247f987fbbbdaa15e4f87fd46a9", size = 7992439, upload-time = "2025-10-09T00:26:40.32Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/8f/76d5dc21ac64a49e5498d7f0472c0781dae442dd266a67458baec38288ec/matplotlib-3.10.7-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:15112bcbaef211bd663fa935ec33313b948e214454d949b723998a43357b17b0", size = 8252283, upload-time = "2025-10-09T00:27:54.739Z" }, + { url = "https://files.pythonhosted.org/packages/27/0d/9c5d4c2317feb31d819e38c9f947c942f42ebd4eb935fc6fd3518a11eaa7/matplotlib-3.10.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d2a959c640cdeecdd2ec3136e8ea0441da59bcaf58d67e9c590740addba2cb68", size = 8116733, upload-time = "2025-10-09T00:27:56.406Z" }, + { url = "https://files.pythonhosted.org/packages/9a/cc/3fe688ff1355010937713164caacf9ed443675ac48a997bab6ed23b3f7c0/matplotlib-3.10.7-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3886e47f64611046bc1db523a09dd0a0a6bed6081e6f90e13806dd1d1d1b5e91", size = 8693919, upload-time = "2025-10-09T00:27:58.41Z" }, +] + +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", 
hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "mmcv" +version = "2.1.0" +source = { url = "https://download.openmmlab.com/mmcv/dist/cu121/torch2.1.0/mmcv-2.1.0-cp311-cp311-manylinux1_x86_64.whl" } +dependencies = [ + { name = "addict" }, + { name = "mmengine" }, + { name = "numpy" }, + { name = "opencv-python" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyyaml" }, + { name = "regex", marker = "sys_platform == 'win32'" }, + { name = "yapf" }, +] +wheels = [ + { url = "https://download.openmmlab.com/mmcv/dist/cu121/torch2.1.0/mmcv-2.1.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e0ee80f8f86e4227eb58636406517a92699a806db427846cceb2ad28ecd73243" }, +] + +[package.metadata] +requires-dist = [ + { name = "addict" }, + { name = "addict", marker = "extra == 'all'" }, + { name = "coverage", marker = "extra == 'all'" }, + { name = "coverage", marker = "extra == 'tests'" }, + { name = "lmdb", marker = "extra == 'all'" }, + { name = "lmdb", marker = "extra == 'tests'" }, + { name = "mmengine", specifier = ">=0.3.0" }, + { name = "mmengine", marker = "extra == 'all'", specifier = ">=0.3.0" }, + { name = "ninja", marker = "extra == 'all'" }, + { name = "ninja", marker = "extra == 'optional'" }, + { name = "numpy" }, + { name = "numpy", marker = "extra == 'all'" }, + { name = "onnx", marker = "extra == 'all'" }, + { name = "onnx", marker = "extra == 'tests'" }, + { name = "onnxoptimizer", marker = "extra == 'all'" }, + { name = "onnxoptimizer", marker = "extra == 'tests'" }, + { name = "onnxruntime", marker = "extra == 'all'" }, + { name = 
"onnxruntime", marker = "extra == 'tests'" }, + { name = "opencv-python", specifier = ">=3" }, + { name = "packaging" }, + { name = "packaging", marker = "extra == 'all'" }, + { name = "pillow" }, + { name = "pillow", marker = "extra == 'all'" }, + { name = "psutil", marker = "extra == 'all'" }, + { name = "psutil", marker = "extra == 'optional'" }, + { name = "pytest", marker = "extra == 'all'" }, + { name = "pytest", marker = "extra == 'tests'" }, + { name = "pytest-runner", marker = "extra == 'all'" }, + { name = "pytest-runner", marker = "extra == 'build'" }, + { name = "pyturbojpeg", marker = "extra == 'all'" }, + { name = "pyturbojpeg", marker = "extra == 'tests'" }, + { name = "pyyaml" }, + { name = "pyyaml", marker = "extra == 'all'" }, + { name = "regex", marker = "sys_platform == 'win32'" }, + { name = "regex", marker = "sys_platform == 'win32' and extra == 'all'" }, + { name = "scipy", marker = "extra == 'all'" }, + { name = "scipy", marker = "extra == 'tests'" }, + { name = "tifffile", marker = "extra == 'all'" }, + { name = "tifffile", marker = "extra == 'tests'" }, + { name = "yapf" }, + { name = "yapf", marker = "extra == 'all'" }, +] +provides-extras = ["all", "build", "optional", "tests"] + +[[package]] +name = "mmdet" +version = "3.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pycocotools" }, + { name = "scipy" }, + { name = "shapely" }, + { name = "six" }, + { name = "terminaltables" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/9e/c897d2fe3c3aa40fd83ea04c6103412cf0bd4db4bb20db4248f5c09673e7/mmdet-3.3.0.tar.gz", hash = "sha256:fe8cc2685d60a2a4f2530a4e92aa6269fe45af93265303a31bf4ea463eb3164f", size = 1249318, upload-time = "2024-01-05T06:25:32.637Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/c7/c2d91161c9b3e1c237ea00e9cefb7f4bfe2854769f56025db415b734aedb/mmdet-3.3.0-py3-none-any.whl", hash = 
"sha256:2e23e291281ac57e7dccf8678e957da45fbe560ce78a1f5ded6afeccd3730f17", size = 2231444, upload-time = "2024-01-05T06:25:30.116Z" }, +] + +[[package]] +name = "mmengine" +version = "0.10.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "addict" }, + { name = "matplotlib" }, + { name = "numpy" }, + { name = "opencv-python" }, + { name = "pyyaml" }, + { name = "regex", marker = "sys_platform == 'win32'" }, + { name = "rich" }, + { name = "termcolor" }, + { name = "yapf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/17/14/959360bbd8374e23fc1b720906999add16a3ac071a501636db12c5861ff5/mmengine-0.10.7.tar.gz", hash = "sha256:d20ffcc31127567e53dceff132612a87f0081de06cbb7ab2bdb7439125a69225", size = 378090, upload-time = "2025-03-04T12:23:09.568Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/8e/f98332248aad102511bea4ae19c0ddacd2f0a994f3ca4c82b7a369e0af8b/mmengine-0.10.7-py3-none-any.whl", hash = "sha256:262ac976a925562f78cd5fd14dd1bc9b680ed0aa81f0d85b723ef782f99c54ee", size = 452720, upload-time = "2025-03-04T12:23:06.339Z" }, +] + +[[package]] +name = "more-itertools" +version = "10.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "msgpack" +version = "1.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/97/560d11202bcd537abca693fd85d81cebe2107ba17301de42b01ac1677b69/msgpack-1.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2e86a607e558d22985d856948c12a3fa7b42efad264dca8a3ebbcfa2735d786c", size = 82271, upload-time = "2025-10-08T09:14:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/83/04/28a41024ccbd67467380b6fb440ae916c1e4f25e2cd4c63abe6835ac566e/msgpack-1.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:283ae72fc89da59aa004ba147e8fc2f766647b1251500182fac0350d8af299c0", size = 84914, upload-time = "2025-10-08T09:14:50.958Z" }, + { url = "https://files.pythonhosted.org/packages/71/46/b817349db6886d79e57a966346cf0902a426375aadc1e8e7a86a75e22f19/msgpack-1.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:61c8aa3bd513d87c72ed0b37b53dd5c5a0f58f2ff9f26e1555d3bd7948fb7296", size = 416962, upload-time = "2025-10-08T09:14:51.997Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/e0/6cc2e852837cd6086fe7d8406af4294e66827a60a4cf60b86575a4a65ca8/msgpack-1.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:454e29e186285d2ebe65be34629fa0e8605202c60fbc7c4c650ccd41870896ef", size = 426183, upload-time = "2025-10-08T09:14:53.477Z" }, + { url = "https://files.pythonhosted.org/packages/25/98/6a19f030b3d2ea906696cedd1eb251708e50a5891d0978b012cb6107234c/msgpack-1.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7bc8813f88417599564fafa59fd6f95be417179f76b40325b500b3c98409757c", size = 411454, upload-time = "2025-10-08T09:14:54.648Z" }, + { url = "https://files.pythonhosted.org/packages/b7/cd/9098fcb6adb32187a70b7ecaabf6339da50553351558f37600e53a4a2a23/msgpack-1.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bafca952dc13907bdfdedfc6a5f579bf4f292bdd506fadb38389afa3ac5b208e", size = 422341, upload-time = "2025-10-08T09:14:56.328Z" }, + { url = "https://files.pythonhosted.org/packages/e6/ae/270cecbcf36c1dc85ec086b33a51a4d7d08fc4f404bdbc15b582255d05ff/msgpack-1.1.2-cp311-cp311-win32.whl", hash = "sha256:602b6740e95ffc55bfb078172d279de3773d7b7db1f703b2f1323566b878b90e", size = 64747, upload-time = "2025-10-08T09:14:57.882Z" }, + { url = "https://files.pythonhosted.org/packages/2a/79/309d0e637f6f37e83c711f547308b91af02b72d2326ddd860b966080ef29/msgpack-1.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d198d275222dc54244bf3327eb8cbe00307d220241d9cec4d306d49a44e85f68", size = 71633, upload-time = "2025-10-08T09:14:59.177Z" }, + { url = "https://files.pythonhosted.org/packages/73/4d/7c4e2b3d9b1106cd0aa6cb56cc57c6267f59fa8bfab7d91df5adc802c847/msgpack-1.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:86f8136dfa5c116365a8a651a7d7484b65b13339731dd6faebb9a0242151c406", size = 64755, upload-time = "2025-10-08T09:15:00.48Z" }, +] + +[[package]] +name = "msgpack-numpy" +version = "0.4.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { 
name = "msgpack" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/08/94/61e8aee142733ebfdc400a05bdac6e1763c4514bba3b42743d223f388450/msgpack-numpy-0.4.8.tar.gz", hash = "sha256:c667d3180513422f9c7545be5eec5d296dcbb357e06f72ed39cc683797556e69", size = 10923, upload-time = "2022-06-09T03:43:08.739Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/5d/f25ac7d4fb77cbd53ddc6d05d833c6bf52b12770a44fa9a447eed470ca9a/msgpack_numpy-0.4.8-py2.py3-none-any.whl", hash = "sha256:773c19d4dfbae1b3c7b791083e2caf66983bb19b40901646f61d8731554ae3da", size = 6919, upload-time = "2022-06-09T03:43:06.82Z" }, +] + +[[package]] +name = "multidict" +version = "6.6.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/7f/90a7f01e2d005d6653c689039977f6856718c75c5579445effb7e60923d1/multidict-6.6.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c7a0e9b561e6460484318a7612e725df1145d46b0ef57c6b9866441bf6e27e0c", size = 76472, upload-time = "2025-08-11T12:06:29.006Z" }, + { url = "https://files.pythonhosted.org/packages/54/a3/bed07bc9e2bb302ce752f1dabc69e884cd6a676da44fb0e501b246031fdd/multidict-6.6.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6bf2f10f70acc7a2446965ffbc726e5fc0b272c97a90b485857e5c70022213eb", size = 44634, upload-time = "2025-08-11T12:06:30.374Z" }, + { url = "https://files.pythonhosted.org/packages/a7/4b/ceeb4f8f33cf81277da464307afeaf164fb0297947642585884f5cad4f28/multidict-6.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:66247d72ed62d5dd29752ffc1d3b88f135c6a8de8b5f63b7c14e973ef5bda19e", size = 44282, upload-time = "2025-08-11T12:06:31.958Z" }, + { 
url = "https://files.pythonhosted.org/packages/03/35/436a5da8702b06866189b69f655ffdb8f70796252a8772a77815f1812679/multidict-6.6.4-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:105245cc6b76f51e408451a844a54e6823bbd5a490ebfe5bdfc79798511ceded", size = 229696, upload-time = "2025-08-11T12:06:33.087Z" }, + { url = "https://files.pythonhosted.org/packages/b6/0e/915160be8fecf1fca35f790c08fb74ca684d752fcba62c11daaf3d92c216/multidict-6.6.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cbbc54e58b34c3bae389ef00046be0961f30fef7cb0dd9c7756aee376a4f7683", size = 246665, upload-time = "2025-08-11T12:06:34.448Z" }, + { url = "https://files.pythonhosted.org/packages/08/ee/2f464330acd83f77dcc346f0b1a0eaae10230291450887f96b204b8ac4d3/multidict-6.6.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:56c6b3652f945c9bc3ac6c8178cd93132b8d82dd581fcbc3a00676c51302bc1a", size = 225485, upload-time = "2025-08-11T12:06:35.672Z" }, + { url = "https://files.pythonhosted.org/packages/71/cc/9a117f828b4d7fbaec6adeed2204f211e9caf0a012692a1ee32169f846ae/multidict-6.6.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b95494daf857602eccf4c18ca33337dd2be705bccdb6dddbfc9d513e6addb9d9", size = 257318, upload-time = "2025-08-11T12:06:36.98Z" }, + { url = "https://files.pythonhosted.org/packages/25/77/62752d3dbd70e27fdd68e86626c1ae6bccfebe2bb1f84ae226363e112f5a/multidict-6.6.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e5b1413361cef15340ab9dc61523e653d25723e82d488ef7d60a12878227ed50", size = 254689, upload-time = "2025-08-11T12:06:38.233Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/6e/fac58b1072a6fc59af5e7acb245e8754d3e1f97f4f808a6559951f72a0d4/multidict-6.6.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e167bf899c3d724f9662ef00b4f7fef87a19c22b2fead198a6f68b263618df52", size = 246709, upload-time = "2025-08-11T12:06:39.517Z" }, + { url = "https://files.pythonhosted.org/packages/01/ef/4698d6842ef5e797c6db7744b0081e36fb5de3d00002cc4c58071097fac3/multidict-6.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aaea28ba20a9026dfa77f4b80369e51cb767c61e33a2d4043399c67bd95fb7c6", size = 243185, upload-time = "2025-08-11T12:06:40.796Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c9/d82e95ae1d6e4ef396934e9b0e942dfc428775f9554acf04393cce66b157/multidict-6.6.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:8c91cdb30809a96d9ecf442ec9bc45e8cfaa0f7f8bdf534e082c2443a196727e", size = 237838, upload-time = "2025-08-11T12:06:42.595Z" }, + { url = "https://files.pythonhosted.org/packages/57/cf/f94af5c36baaa75d44fab9f02e2a6bcfa0cd90acb44d4976a80960759dbc/multidict-6.6.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a0ccbfe93ca114c5d65a2471d52d8829e56d467c97b0e341cf5ee45410033b3", size = 246368, upload-time = "2025-08-11T12:06:44.304Z" }, + { url = "https://files.pythonhosted.org/packages/4a/fe/29f23460c3d995f6a4b678cb2e9730e7277231b981f0b234702f0177818a/multidict-6.6.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:55624b3f321d84c403cb7d8e6e982f41ae233d85f85db54ba6286f7295dc8a9c", size = 253339, upload-time = "2025-08-11T12:06:45.597Z" }, + { url = "https://files.pythonhosted.org/packages/29/b6/fd59449204426187b82bf8a75f629310f68c6adc9559dc922d5abe34797b/multidict-6.6.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4a1fb393a2c9d202cb766c76208bd7945bc194eba8ac920ce98c6e458f0b524b", size = 246933, upload-time = "2025-08-11T12:06:46.841Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/52/d5d6b344f176a5ac3606f7a61fb44dc746e04550e1a13834dff722b8d7d6/multidict-6.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:43868297a5759a845fa3a483fb4392973a95fb1de891605a3728130c52b8f40f", size = 242225, upload-time = "2025-08-11T12:06:48.588Z" }, + { url = "https://files.pythonhosted.org/packages/ec/d3/5b2281ed89ff4d5318d82478a2a2450fcdfc3300da48ff15c1778280ad26/multidict-6.6.4-cp311-cp311-win32.whl", hash = "sha256:ed3b94c5e362a8a84d69642dbeac615452e8af9b8eb825b7bc9f31a53a1051e2", size = 41306, upload-time = "2025-08-11T12:06:49.95Z" }, + { url = "https://files.pythonhosted.org/packages/74/7d/36b045c23a1ab98507aefd44fd8b264ee1dd5e5010543c6fccf82141ccef/multidict-6.6.4-cp311-cp311-win_amd64.whl", hash = "sha256:d8c112f7a90d8ca5d20213aa41eac690bb50a76da153e3afb3886418e61cb22e", size = 46029, upload-time = "2025-08-11T12:06:51.082Z" }, + { url = "https://files.pythonhosted.org/packages/0f/5e/553d67d24432c5cd52b49047f2d248821843743ee6d29a704594f656d182/multidict-6.6.4-cp311-cp311-win_arm64.whl", hash = "sha256:3bb0eae408fa1996d87247ca0d6a57b7fc1dcf83e8a5c47ab82c558c250d4adf", size = 43017, upload-time = "2025-08-11T12:06:52.243Z" }, + { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, +] + +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628, upload-time = "2024-01-28T18:52:30.853Z" }, + { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload-time = "2024-01-28T18:52:31.981Z" }, +] + +[[package]] +name = "narwhals" +version = "2.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/e5/ef07d31c2e07d99eecac8e14ace5c20aeb00ecba4ed5bb00343136380524/narwhals-2.10.0.tar.gz", hash = "sha256:1c05bbef2048a4045263de7d98c3d06140583eb13d796dd733b2157f05d24485", size = 582423, upload-time = "2025-10-27T17:55:55.632Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/13/024ae0586d901f8a6f99e2d29b4ae217e8ef11d3fd944cdfc3bbde5f2a08/narwhals-2.10.0-py3-none-any.whl", hash = "sha256:baed44e8fc38e800e3a585e3fa9843a7079a6fad5fbffbecee4348d6ac52298c", size = 418077, upload-time = "2025-10-27T17:55:53.709Z" }, +] + +[[package]] +name = "natsort" +version = "8.4.0" +source = { registry = "https://pypi.org/simple" 
} +sdist = { url = "https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581", size = 76575, upload-time = "2023-06-20T04:17:19.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/82/7a9d0550484a62c6da82858ee9419f3dd1ccc9aa1c26a1e43da3ecd20b0d/natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c", size = 38268, upload-time = "2023-06-20T04:17:17.522Z" }, +] + +[[package]] +name = "nest-asyncio" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" }, +] + +[[package]] +name = "networkx" +version = "3.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, 
+] + +[[package]] +name = "numba" +version = "0.62.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "llvmlite" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/20/33dbdbfe60e5fd8e3dbfde299d106279a33d9f8308346022316781368591/numba-0.62.1.tar.gz", hash = "sha256:7b774242aa890e34c21200a1fc62e5b5757d5286267e71103257f4e2af0d5161", size = 2749817, upload-time = "2025-09-29T10:46:31.551Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/5f/8b3491dd849474f55e33c16ef55678ace1455c490555337899c35826836c/numba-0.62.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:f43e24b057714e480fe44bc6031de499e7cf8150c63eb461192caa6cc8530bc8", size = 2684279, upload-time = "2025-09-29T10:43:37.213Z" }, + { url = "https://files.pythonhosted.org/packages/bf/18/71969149bfeb65a629e652b752b80167fe8a6a6f6e084f1f2060801f7f31/numba-0.62.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:57cbddc53b9ee02830b828a8428757f5c218831ccc96490a314ef569d8342b7b", size = 2687330, upload-time = "2025-09-29T10:43:59.601Z" }, + { url = "https://files.pythonhosted.org/packages/0e/7d/403be3fecae33088027bc8a95dc80a2fda1e3beff3e0e5fc4374ada3afbe/numba-0.62.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:604059730c637c7885386521bb1b0ddcbc91fd56131a6dcc54163d6f1804c872", size = 3739727, upload-time = "2025-09-29T10:42:45.922Z" }, + { url = "https://files.pythonhosted.org/packages/e0/c3/3d910d08b659a6d4c62ab3cd8cd93c4d8b7709f55afa0d79a87413027ff6/numba-0.62.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d6c540880170bee817011757dc9049dba5a29db0c09b4d2349295991fe3ee55f", size = 3445490, upload-time = "2025-09-29T10:43:12.692Z" }, + { url = "https://files.pythonhosted.org/packages/5b/82/9d425c2f20d9f0a37f7cb955945a553a00fa06a2b025856c3550227c5543/numba-0.62.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:03de6d691d6b6e2b76660ba0f38f37b81ece8b2cc524a62f2a0cfae2bfb6f9da", size = 2745550, upload-time = "2025-09-29T10:44:20.571Z" }, +] + +[[package]] +name = "numpy" +version = "1.26.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, + { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, + { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, +] + +[[package]] +name = "oauth2client" +version = "4.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httplib2" }, + { name = "pyasn1" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/7b/17244b1083e8e604bf154cf9b716aecd6388acd656dd01893d0d244c94d9/oauth2client-4.1.3.tar.gz", hash = "sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6", size = 155910, upload-time = "2018-09-07T21:38:18.036Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/a9/4f25a14d23f0786b64875b91784607c2277eff25d48f915e39ff0cff505a/oauth2client-4.1.3-py2.py3-none-any.whl", hash = 
"sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac", size = 98206, upload-time = "2018-09-07T21:38:16.742Z" }, +] + +[[package]] +name = "oauthlib" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, +] + +[[package]] +name = "omegaconf" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120, upload-time = "2022-12-08T20:59:22.753Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", size = 79500, upload-time = "2022-12-08T20:59:19.686Z" }, +] + +[[package]] +name = "opencv-python" +version = "4.11.0.86" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = 
"sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956, upload-time = "2025-01-16T13:52:24.737Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/4d/53b30a2a3ac1f75f65a59eb29cf2ee7207ce64867db47036ad61743d5a23/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a", size = 37326322, upload-time = "2025-01-16T13:52:25.887Z" }, + { url = "https://files.pythonhosted.org/packages/3b/84/0a67490741867eacdfa37bc18df96e08a9d579583b419010d7f3da8ff503/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66", size = 56723197, upload-time = "2025-01-16T13:55:21.222Z" }, + { url = "https://files.pythonhosted.org/packages/f3/bd/29c126788da65c1fb2b5fb621b7fed0ed5f9122aa22a0868c5e2c15c6d23/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202", size = 42230439, upload-time = "2025-01-16T13:51:35.822Z" }, + { url = "https://files.pythonhosted.org/packages/2c/8b/90eb44a40476fa0e71e05a0283947cfd74a5d36121a11d926ad6f3193cc4/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d", size = 62986597, upload-time = "2025-01-16T13:52:08.836Z" }, + { url = "https://files.pythonhosted.org/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337, upload-time = "2025-01-16T13:52:13.549Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = 
"sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" }, +] + +[[package]] +name = "outcome" +version = "1.3.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/df/77698abfac98571e65ffeb0c1fba8ffd692ab8458d617a0eed7d9a8d38f2/outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", size = 21060, upload-time = "2023-10-26T04:26:04.361Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/8b/5ab7257531a5d830fc8000c476e63c935488d74609b50f9384a643ec0a62/outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b", size = 10692, upload-time = "2023-10-26T04:26:02.532Z" }, +] + +[[package]] +name = "packaging" +version = "25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, +] + +[[package]] +name = "pandas" +version = "2.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/6f/75aa71f8a14267117adeeed5d21b204770189c0a0025acbdc03c337b28fc/pandas-2.3.1.tar.gz", hash = 
"sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2", size = 4487493, upload-time = "2025-07-07T19:20:04.079Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/1c/ccf70029e927e473a4476c00e0d5b32e623bff27f0402d0a92b7fc29bb9f/pandas-2.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2b0540963d83431f5ce8870ea02a7430adca100cec8a050f0811f8e31035541b", size = 11566608, upload-time = "2025-07-07T19:18:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/ec/d3/3c37cb724d76a841f14b8f5fe57e5e3645207cc67370e4f84717e8bb7657/pandas-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fe7317f578c6a153912bd2292f02e40c1d8f253e93c599e82620c7f69755c74f", size = 10823181, upload-time = "2025-07-07T19:18:36.151Z" }, + { url = "https://files.pythonhosted.org/packages/8a/4c/367c98854a1251940edf54a4df0826dcacfb987f9068abf3e3064081a382/pandas-2.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6723a27ad7b244c0c79d8e7007092d7c8f0f11305770e2f4cd778b3ad5f9f85", size = 11793570, upload-time = "2025-07-07T19:18:38.385Z" }, + { url = "https://files.pythonhosted.org/packages/07/5f/63760ff107bcf5146eee41b38b3985f9055e710a72fdd637b791dea3495c/pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3462c3735fe19f2638f2c3a40bd94ec2dc5ba13abbb032dd2fa1f540a075509d", size = 12378887, upload-time = "2025-07-07T19:18:41.284Z" }, + { url = "https://files.pythonhosted.org/packages/15/53/f31a9b4dfe73fe4711c3a609bd8e60238022f48eacedc257cd13ae9327a7/pandas-2.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:98bcc8b5bf7afed22cc753a28bc4d9e26e078e777066bc53fac7904ddef9a678", size = 13230957, upload-time = "2025-07-07T19:18:44.187Z" }, + { url = "https://files.pythonhosted.org/packages/e0/94/6fce6bf85b5056d065e0a7933cba2616dcb48596f7ba3c6341ec4bcc529d/pandas-2.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:4d544806b485ddf29e52d75b1f559142514e60ef58a832f74fb38e48d757b299", size = 13883883, upload-time = "2025-07-07T19:18:46.498Z" }, + { url = "https://files.pythonhosted.org/packages/c8/7b/bdcb1ed8fccb63d04bdb7635161d0ec26596d92c9d7a6cce964e7876b6c1/pandas-2.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:b3cd4273d3cb3707b6fffd217204c52ed92859533e31dc03b7c5008aa933aaab", size = 11340212, upload-time = "2025-07-07T19:18:49.293Z" }, +] + +[[package]] +name = "parso" +version = "0.8.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/de/53e0bcf53d13e005bd8c92e7855142494f41171b34c2536b86187474184d/parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a", size = 401205, upload-time = "2025-08-23T15:15:28.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, +] + +[[package]] +name = "pdf2image" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/d8/b280f01045555dc257b8153c00dee3bc75830f91a744cd5f84ef3a0a64b1/pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57", size = 12811, upload-time = "2024-01-07T20:33:01.965Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2", size = 11618, upload-time = "2024-01-07T20:32:59.957Z" }, +] + +[[package]] +name = "pdfminer-six" +version = "20251230" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "charset-normalizer" }, + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/46/9a/d79d8fa6d47a0338846bb558b39b9963b8eb2dfedec61867c138c1b17eeb/pdfminer_six-20251230.tar.gz", hash = "sha256:e8f68a14c57e00c2d7276d26519ea64be1b48f91db1cdc776faa80528ca06c1e", size = 8511285, upload-time = "2025-12-30T15:49:13.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/d7/b288ea32deb752a09aab73c75e1e7572ab2a2b56c3124a5d1eb24c62ceb3/pdfminer_six-20251230-py3-none-any.whl", hash = "sha256:9ff2e3466a7dfc6de6fd779478850b6b7c2d9e9405aa2a5869376a822771f485", size = 6591909, upload-time = "2025-12-30T15:49:10.76Z" }, +] + +[[package]] +name = "pdfplumber" +version = "0.11.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pdfminer-six" }, + { name = "pillow" }, + { name = "pypdfium2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/37/9ca3519e92a8434eb93be570b131476cc0a4e840bb39c62ddb7813a39d53/pdfplumber-0.11.9.tar.gz", hash = "sha256:481224b678b2bbdbf376e2c39bf914144eef7c3d301b4a28eebf0f7f6109d6dc", size = 102768, upload-time = "2026-01-05T08:10:29.072Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/c8/cdbc975f5b634e249cfa6597e37c50f3078412474f21c015e508bfbfe3c3/pdfplumber-0.11.9-py3-none-any.whl", hash = "sha256:33ec5580959ba524e9100138746e090879504c42955df1b8a997604dd326c443", size = 60045, upload-time = "2026-01-05T08:10:27.512Z" }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + +[[package]] +name = "pillow" +version = "11.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/26/77f8ed17ca4ffd60e1dcd220a6ec6d71210ba398cfa33a13a1cd614c5613/pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722", size = 5316531, upload-time = "2025-07-01T09:13:59.203Z" }, + { url = "https://files.pythonhosted.org/packages/cb/39/ee475903197ce709322a17a866892efb560f57900d9af2e55f86db51b0a5/pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288", size = 4686560, upload-time = "2025-07-01T09:14:01.101Z" }, + { url = "https://files.pythonhosted.org/packages/d5/90/442068a160fd179938ba55ec8c97050a612426fae5ec0a764e345839f76d/pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d", size = 5870978, upload-time = "2025-07-03T13:09:55.638Z" }, + { url = "https://files.pythonhosted.org/packages/13/92/dcdd147ab02daf405387f0218dcf792dc6dd5b14d2573d40b4caeef01059/pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494", size = 7641168, upload-time = "2025-07-03T13:10:00.37Z" }, + { url = 
"https://files.pythonhosted.org/packages/6e/db/839d6ba7fd38b51af641aa904e2960e7a5644d60ec754c046b7d2aee00e5/pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58", size = 5973053, upload-time = "2025-07-01T09:14:04.491Z" }, + { url = "https://files.pythonhosted.org/packages/f2/2f/d7675ecae6c43e9f12aa8d58b6012683b20b6edfbdac7abcb4e6af7a3784/pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f", size = 6640273, upload-time = "2025-07-01T09:14:06.235Z" }, + { url = "https://files.pythonhosted.org/packages/45/ad/931694675ede172e15b2ff03c8144a0ddaea1d87adb72bb07655eaffb654/pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e", size = 6082043, upload-time = "2025-07-01T09:14:07.978Z" }, + { url = "https://files.pythonhosted.org/packages/3a/04/ba8f2b11fc80d2dd462d7abec16351b45ec99cbbaea4387648a44190351a/pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94", size = 6715516, upload-time = "2025-07-01T09:14:10.233Z" }, + { url = "https://files.pythonhosted.org/packages/48/59/8cd06d7f3944cc7d892e8533c56b0acb68399f640786313275faec1e3b6f/pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0", size = 6274768, upload-time = "2025-07-01T09:14:11.921Z" }, + { url = "https://files.pythonhosted.org/packages/f1/cc/29c0f5d64ab8eae20f3232da8f8571660aa0ab4b8f1331da5c2f5f9a938e/pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac", size = 6986055, upload-time = "2025-07-01T09:14:13.623Z" }, + { url = 
"https://files.pythonhosted.org/packages/c6/df/90bd886fabd544c25addd63e5ca6932c86f2b701d5da6c7839387a076b4a/pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd", size = 2423079, upload-time = "2025-07-01T09:14:15.268Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e3/6fa84033758276fb31da12e5fb66ad747ae83b93c67af17f8c6ff4cc8f34/pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6", size = 5270566, upload-time = "2025-07-01T09:16:19.801Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ee/e8d2e1ab4892970b561e1ba96cbd59c0d28cf66737fc44abb2aec3795a4e/pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438", size = 4654618, upload-time = "2025-07-01T09:16:21.818Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6d/17f80f4e1f0761f02160fc433abd4109fa1548dcfdca46cfdadaf9efa565/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3", size = 4874248, upload-time = "2025-07-03T13:11:20.738Z" }, + { url = "https://files.pythonhosted.org/packages/de/5f/c22340acd61cef960130585bbe2120e2fd8434c214802f07e8c03596b17e/pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c", size = 6583963, upload-time = "2025-07-03T13:11:26.283Z" }, + { url = "https://files.pythonhosted.org/packages/31/5e/03966aedfbfcbb4d5f8aa042452d3361f325b963ebbadddac05b122e47dd/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361", size = 4957170, upload-time = "2025-07-01T09:16:23.762Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/2d/e082982aacc927fc2cab48e1e731bdb1643a1406acace8bed0900a61464e/pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7", size = 5581505, upload-time = "2025-07-01T09:16:25.593Z" }, + { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" }, +] + +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" }, +] + +[[package]] +name = "playwright" +version = "1.55.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet" }, + { name = "pyee" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/3a/c81ff76df266c62e24f19718df9c168f49af93cabdbc4608ae29656a9986/playwright-1.55.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:d7da108a95001e412effca4f7610de79da1637ccdf670b1ae3fdc08b9694c034", size = 40428109, upload-time = "2025-08-28T15:46:20.357Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/f5/bdb61553b20e907196a38d864602a9b4a461660c3a111c67a35179b636fa/playwright-1.55.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:8290cf27a5d542e2682ac274da423941f879d07b001f6575a5a3a257b1d4ba1c", size = 38687254, upload-time = "2025-08-28T15:46:23.925Z" }, + { url = "https://files.pythonhosted.org/packages/4a/64/48b2837ef396487807e5ab53c76465747e34c7143fac4a084ef349c293a8/playwright-1.55.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:25b0d6b3fd991c315cca33c802cf617d52980108ab8431e3e1d37b5de755c10e", size = 40428108, upload-time = "2025-08-28T15:46:27.119Z" }, + { url = "https://files.pythonhosted.org/packages/08/33/858312628aa16a6de97839adc2ca28031ebc5391f96b6fb8fdf1fcb15d6c/playwright-1.55.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:c6d4d8f6f8c66c483b0835569c7f0caa03230820af8e500c181c93509c92d831", size = 45905643, upload-time = "2025-08-28T15:46:30.312Z" }, + { url = "https://files.pythonhosted.org/packages/83/83/b8d06a5b5721931aa6d5916b83168e28bd891f38ff56fe92af7bdee9860f/playwright-1.55.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29a0777c4ce1273acf90c87e4ae2fe0130182100d99bcd2ae5bf486093044838", size = 45296647, upload-time = "2025-08-28T15:46:33.221Z" }, + { url = "https://files.pythonhosted.org/packages/06/2e/9db64518aebcb3d6ef6cd6d4d01da741aff912c3f0314dadb61226c6a96a/playwright-1.55.0-py3-none-win32.whl", hash = "sha256:29e6d1558ad9d5b5c19cbec0a72f6a2e35e6353cd9f262e22148685b86759f90", size = 35476046, upload-time = "2025-08-28T15:46:36.184Z" }, + { url = "https://files.pythonhosted.org/packages/46/4f/9ba607fa94bb9cee3d4beb1c7b32c16efbfc9d69d5037fa85d10cafc618b/playwright-1.55.0-py3-none-win_amd64.whl", hash = "sha256:7eb5956473ca1951abb51537e6a0da55257bb2e25fc37c2b75af094a5c93736c", size = 35476048, upload-time = "2025-08-28T15:46:38.867Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/98/5ca173c8ec906abde26c28e1ecb34887343fd71cc4136261b90036841323/playwright-1.55.0-py3-none-win_arm64.whl", hash = "sha256:012dc89ccdcbd774cdde8aeee14c08e0dd52ddb9135bf10e9db040527386bd76", size = 31225543, upload-time = "2025-08-28T15:46:41.613Z" }, +] + +[[package]] +name = "plotly" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "narwhals" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0c/63/961d47c9ffd592a575495891cdcf7875dc0903ebb33ac238935714213789/plotly-6.3.1.tar.gz", hash = "sha256:dd896e3d940e653a7ce0470087e82c2bd903969a55e30d1b01bb389319461bb0", size = 6956460, upload-time = "2025-10-02T16:10:34.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/93/023955c26b0ce614342d11cc0652f1e45e32393b6ab9d11a664a60e9b7b7/plotly-6.3.1-py3-none-any.whl", hash = "sha256:8b4420d1dcf2b040f5983eed433f95732ed24930e496d36eb70d211923532e64", size = 9833698, upload-time = "2025-10-02T16:10:22.584Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "postgrest" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecation" }, + { name = "httpx", extra = ["http2"] }, + { name = "pydantic" }, + { name = "yarl" }, +] 
+sdist = { url = "https://files.pythonhosted.org/packages/50/ce/cf638adae7b454650aeba0537886a4ab23327d0bfdf7842d74173584345b/postgrest-2.25.1.tar.gz", hash = "sha256:73fcf2acfc0724702c0487224e3a1fdb888f7bfd9644eeb225a94d91be0920f9", size = 13681, upload-time = "2025-12-10T21:48:28.22Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/b6/b20f69577f693d415981af21e2b3b03308ef50c79a2ee7a8bed796791965/postgrest-2.25.1-py3-none-any.whl", hash = "sha256:8fb7944c613022398ff1e643621c232b170d363a7333b9dd316360ab37dc5b4e", size = 21582, upload-time = "2025-12-10T21:48:27.017Z" }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + +[[package]] +name = "propcache" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/16/43264e4a779dd8588c21a70f0709665ee8f611211bdd2c87d952cfa7c776/propcache-0.3.2.tar.gz", hash = "sha256:20d7d62e4e7ef05f221e0db2856b979540686342e7dd9973b815599c7057e168", size = 44139, upload-time = "2025-06-09T22:56:06.081Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/8d/e8b436717ab9c2cfc23b116d2c297305aa4cd8339172a456d61ebf5669b8/propcache-0.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:0b8d2f607bd8f80ddc04088bc2a037fdd17884a6fcadc47a96e334d72f3717be", size = 74207, upload-time = "2025-06-09T22:54:05.399Z" }, + { url = "https://files.pythonhosted.org/packages/d6/29/1e34000e9766d112171764b9fa3226fa0153ab565d0c242c70e9945318a7/propcache-0.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06766d8f34733416e2e34f46fea488ad5d60726bb9481d3cddf89a6fa2d9603f", size = 43648, upload-time = "2025-06-09T22:54:08.023Z" }, + { url = "https://files.pythonhosted.org/packages/46/92/1ad5af0df781e76988897da39b5f086c2bf0f028b7f9bd1f409bb05b6874/propcache-0.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2dc1f4a1df4fecf4e6f68013575ff4af84ef6f478fe5344317a65d38a8e6dc9", size = 43496, upload-time = "2025-06-09T22:54:09.228Z" }, + { url = "https://files.pythonhosted.org/packages/b3/ce/e96392460f9fb68461fabab3e095cb00c8ddf901205be4eae5ce246e5b7e/propcache-0.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be29c4f4810c5789cf10ddf6af80b041c724e629fa51e308a7a0fb19ed1ef7bf", size = 217288, upload-time = "2025-06-09T22:54:10.466Z" }, + { url = "https://files.pythonhosted.org/packages/c5/2a/866726ea345299f7ceefc861a5e782b045545ae6940851930a6adaf1fca6/propcache-0.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d61f6970ecbd8ff2e9360304d5c8876a6abd4530cb752c06586849ac8a9dc9", size = 227456, upload-time = "2025-06-09T22:54:11.828Z" }, + { url = "https://files.pythonhosted.org/packages/de/03/07d992ccb6d930398689187e1b3c718339a1c06b8b145a8d9650e4726166/propcache-0.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:62180e0b8dbb6b004baec00a7983e4cc52f5ada9cd11f48c3528d8cfa7b96a66", size = 225429, upload-time = "2025-06-09T22:54:13.823Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/116ba39448753b1330f48ab8ba927dcd6cf0baea8a0ccbc512dfb49ba670/propcache-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c144ca294a204c470f18cf4c9d78887810d04a3e2fbb30eea903575a779159df", size = 213472, upload-time = "2025-06-09T22:54:15.232Z" }, + { url = "https://files.pythonhosted.org/packages/a6/85/f01f5d97e54e428885a5497ccf7f54404cbb4f906688a1690cd51bf597dc/propcache-0.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5c2a784234c28854878d68978265617aa6dc0780e53d44b4d67f3651a17a9a2", size = 204480, upload-time = "2025-06-09T22:54:17.104Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/7bf5ab9033b8b8194cc3f7cf1aaa0e9c3256320726f64a3e1f113a812dce/propcache-0.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5745bc7acdafa978ca1642891b82c19238eadc78ba2aaa293c6863b304e552d7", size = 214530, upload-time = "2025-06-09T22:54:18.512Z" }, + { url = "https://files.pythonhosted.org/packages/31/0b/bd3e0c00509b609317df4a18e6b05a450ef2d9a963e1d8bc9c9415d86f30/propcache-0.3.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:c0075bf773d66fa8c9d41f66cc132ecc75e5bb9dd7cce3cfd14adc5ca184cb95", size = 205230, upload-time = "2025-06-09T22:54:19.947Z" }, + { url = "https://files.pythonhosted.org/packages/7a/23/fae0ff9b54b0de4e819bbe559508da132d5683c32d84d0dc2ccce3563ed4/propcache-0.3.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5f57aa0847730daceff0497f417c9de353c575d8da3579162cc74ac294c5369e", size = 206754, upload-time = "2025-06-09T22:54:21.716Z" }, + { url = "https://files.pythonhosted.org/packages/b7/7f/ad6a3c22630aaa5f618b4dc3c3598974a72abb4c18e45a50b3cdd091eb2f/propcache-0.3.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:eef914c014bf72d18efb55619447e0aecd5fb7c2e3fa7441e2e5d6099bddff7e", size = 218430, upload-time = "2025-06-09T22:54:23.17Z" }, + { url = "https://files.pythonhosted.org/packages/5b/2c/ba4f1c0e8a4b4c75910742f0d333759d441f65a1c7f34683b4a74c0ee015/propcache-0.3.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:2a4092e8549031e82facf3decdbc0883755d5bbcc62d3aea9d9e185549936dcf", size = 223884, upload-time = "2025-06-09T22:54:25.539Z" }, + { url = "https://files.pythonhosted.org/packages/88/e4/ebe30fc399e98572019eee82ad0caf512401661985cbd3da5e3140ffa1b0/propcache-0.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:85871b050f174bc0bfb437efbdb68aaf860611953ed12418e4361bc9c392749e", size = 211480, upload-time = "2025-06-09T22:54:26.892Z" }, + { url = "https://files.pythonhosted.org/packages/96/0a/7d5260b914e01d1d0906f7f38af101f8d8ed0dc47426219eeaf05e8ea7c2/propcache-0.3.2-cp311-cp311-win32.whl", hash = "sha256:36c8d9b673ec57900c3554264e630d45980fd302458e4ac801802a7fd2ef7897", size = 37757, upload-time = "2025-06-09T22:54:28.241Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2d/89fe4489a884bc0da0c3278c552bd4ffe06a1ace559db5ef02ef24ab446b/propcache-0.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53af8cb6a781b02d2ea079b5b853ba9430fcbe18a8e3ce647d5982a3ff69f39", size = 41500, upload-time = "2025-06-09T22:54:29.4Z" }, + { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, +] + +[[package]] +name = "proto-plus" +version = "1.26.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = 
"sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, +] + +[[package]] +name = "protobuf" +version = "6.32.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c0/df/fb4a8eeea482eca989b51cffd274aac2ee24e825f0bf3cbce5281fa1567b/protobuf-6.32.0.tar.gz", hash = "sha256:a81439049127067fc49ec1d36e25c6ee1d1a2b7be930675f919258d03c04e7d2", size = 440614, upload-time = "2025-08-14T21:21:25.015Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/18/df8c87da2e47f4f1dcc5153a81cd6bca4e429803f4069a299e236e4dd510/protobuf-6.32.0-cp310-abi3-win32.whl", hash = "sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741", size = 424409, upload-time = "2025-08-14T21:21:12.366Z" }, + { url = "https://files.pythonhosted.org/packages/e1/59/0a820b7310f8139bd8d5a9388e6a38e1786d179d6f33998448609296c229/protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e", size = 435735, upload-time = "2025-08-14T21:21:15.046Z" }, + { url = "https://files.pythonhosted.org/packages/cc/5b/0d421533c59c789e9c9894683efac582c06246bf24bb26b753b149bd88e4/protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d52691e5bee6c860fff9a1c86ad26a13afbeb4b168cd4445c922b7e2cf85aaf0", size = 426449, upload-time = "2025-08-14T21:21:16.687Z" }, + { url = "https://files.pythonhosted.org/packages/ec/7b/607764ebe6c7a23dcee06e054fd1de3d5841b7648a90fd6def9a3bb58c5e/protobuf-6.32.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:501fe6372fd1c8ea2a30b4d9be8f87955a64d6be9c88a973996cef5ef6f0abf1", size = 322869, upload-time = "2025-08-14T21:21:18.282Z" }, + { url = "https://files.pythonhosted.org/packages/40/01/2e730bd1c25392fc32e3268e02446f0d77cb51a2c3a8486b1798e34d5805/protobuf-6.32.0-cp39-abi3-manylinux2014_x86_64.whl", hash = 
"sha256:75a2aab2bd1aeb1f5dc7c5f33bcb11d82ea8c055c9becbb41c26a8c43fd7092c", size = 322009, upload-time = "2025-08-14T21:21:19.893Z" }, + { url = "https://files.pythonhosted.org/packages/9c/f2/80ffc4677aac1bc3519b26bc7f7f5de7fce0ee2f7e36e59e27d8beb32dd1/protobuf-6.32.0-py3-none-any.whl", hash = "sha256:ba377e5b67b908c8f3072a57b63e2c6a4cbd18aea4ed98d2584350dbf46f2783", size = 169287, upload-time = "2025-08-14T21:21:23.515Z" }, +] + +[[package]] +name = "psutil" +version = "7.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/ec/7b8e6b9b1d22708138630ef34c53ab2b61032c04f16adfdbb96791c8c70c/psutil-7.1.2.tar.gz", hash = "sha256:aa225cdde1335ff9684708ee8c72650f6598d5ed2114b9a7c5802030b1785018", size = 487424, upload-time = "2025-10-25T10:46:34.931Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/89/b9f8d47ddbc52d7301fc868e8224e5f44ed3c7f55e6d0f54ecaf5dd9ff5e/psutil-7.1.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c9ba5c19f2d46203ee8c152c7b01df6eec87d883cfd8ee1af2ef2727f6b0f814", size = 237244, upload-time = "2025-10-25T10:47:07.086Z" }, + { url = "https://files.pythonhosted.org/packages/c8/7a/8628c2f6b240680a67d73d8742bb9ff39b1820a693740e43096d5dcb01e5/psutil-7.1.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:2a486030d2fe81bec023f703d3d155f4823a10a47c36784c84f1cc7f8d39bedb", size = 238101, upload-time = "2025-10-25T10:47:09.523Z" }, + { url = "https://files.pythonhosted.org/packages/30/28/5e27f4d5a0e347f8e3cc16cd7d35533dbce086c95807f1f0e9cd77e26c10/psutil-7.1.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3efd8fc791492e7808a51cb2b94889db7578bfaea22df931424f874468e389e3", size = 258675, upload-time = "2025-10-25T10:47:11.082Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/5c/79cf60c9acf36d087f0db0f82066fca4a780e97e5b3a2e4c38209c03d170/psutil-7.1.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2aeb9b64f481b8eabfc633bd39e0016d4d8bbcd590d984af764d80bf0851b8a", size = 260203, upload-time = "2025-10-25T10:47:13.226Z" }, + { url = "https://files.pythonhosted.org/packages/f7/03/0a464404c51685dcb9329fdd660b1721e076ccd7b3d97dee066bcc9ffb15/psutil-7.1.2-cp37-abi3-win_amd64.whl", hash = "sha256:8e17852114c4e7996fe9da4745c2bdef001ebbf2f260dec406290e66628bdb91", size = 246714, upload-time = "2025-10-25T10:47:15.093Z" }, + { url = "https://files.pythonhosted.org/packages/6a/32/97ca2090f2f1b45b01b6aa7ae161cfe50671de097311975ca6eea3e7aabc/psutil-7.1.2-cp37-abi3-win_arm64.whl", hash = "sha256:3e988455e61c240cc879cb62a008c2699231bf3e3d061d7fce4234463fd2abb4", size = 243742, upload-time = "2025-10-25T10:47:17.302Z" }, +] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 
19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + +[[package]] +name = "pyarrow" +version = "21.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/dc/80564a3071a57c20b7c32575e4a0120e8a330ef487c319b122942d665960/pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b", size = 31243234, upload-time = "2025-07-18T00:55:03.812Z" }, + { url = "https://files.pythonhosted.org/packages/ea/cc/3b51cb2db26fe535d14f74cab4c79b191ed9a8cd4cbba45e2379b5ca2746/pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10", size = 32714370, upload-time = "2025-07-18T00:55:07.495Z" }, + { url = "https://files.pythonhosted.org/packages/24/11/a4431f36d5ad7d83b87146f515c063e4d07ef0b7240876ddb885e6b44f2e/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e", size = 41135424, upload-time = "2025-07-18T00:55:11.461Z" }, + { url = "https://files.pythonhosted.org/packages/74/dc/035d54638fc5d2971cbf1e987ccd45f1091c83bcf747281cf6cc25e72c88/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569", size = 42823810, upload-time = 
"2025-07-18T00:55:16.301Z" }, + { url = "https://files.pythonhosted.org/packages/2e/3b/89fced102448a9e3e0d4dded1f37fa3ce4700f02cdb8665457fcc8015f5b/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e", size = 43391538, upload-time = "2025-07-18T00:55:23.82Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bb/ea7f1bd08978d39debd3b23611c293f64a642557e8141c80635d501e6d53/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c", size = 45120056, upload-time = "2025-07-18T00:55:28.231Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0b/77ea0600009842b30ceebc3337639a7380cd946061b620ac1a2f3cb541e2/pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6", size = 26220568, upload-time = "2025-07-18T00:55:32.122Z" }, +] + +[[package]] +name = "pyasn1" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = "2024-09-10T22:41:42.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = 
"sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + +[[package]] +name = "pycocotools" +version = "2.0.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/a6/694fd661f0feb5e91f7049a202ea12de312ca9010c33bd9d9f0c63046c01/pycocotools-2.0.10.tar.gz", hash = "sha256:7a47609cdefc95e5e151313c7d93a61cf06e15d42c7ba99b601e3bc0f9ece2e1", size = 25389, upload-time = "2025-06-04T23:37:47.879Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/36/aebbbddd9c659f1fc9d78daeaf6e39860813bb014b0de873073361ad40f1/pycocotools-2.0.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:68846da0ee3ea82d71bcbd99ed28271633a67a899cfbacd2ef309b2e455524b2", size = 155033, upload-time = "2025-06-04T23:37:01.835Z" }, + { url = "https://files.pythonhosted.org/packages/57/c2/e4c96950604c709fbd71c49828968fadd9d8ca8cf74f52be4cd4b2ff9300/pycocotools-2.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20831839a771d4bc60a814e7b54a92d9a45a773dee47959d30888d00066059c3", size = 470328, upload-time = "2025-06-04T23:37:03.675Z" }, + { url = "https://files.pythonhosted.org/packages/a7/ec/7827cd9ce6e80f739fab0163ecb3765df54af744a9bab64b0058bdce47ef/pycocotools-2.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1760c10459dfb4229e7436ae380228428efb0115bbe332a51b72d07fa085d8c0", size = 477331, upload-time = "2025-06-04T23:37:05.703Z" }, + { url = 
"https://files.pythonhosted.org/packages/81/74/33ce685ae1cd6312b2526f701e43dfeb73d1c860878b72a30ac1cc322536/pycocotools-2.0.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5146bc881f380e8fb493e49216083298e4a06f778841f8b9b1d45b21e211d0e4", size = 489735, upload-time = "2025-06-04T23:37:08.488Z" }, + { url = "https://files.pythonhosted.org/packages/17/79/0e02ce700ff9c9fd30e57a84add42bd6fc033e743b76870ef68215d3f3f4/pycocotools-2.0.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:23f7d0c551d4c31cab629ce177186db9562f10414320add5267707a84cf6cdfa", size = 507779, upload-time = "2025-06-04T23:37:10.159Z" }, + { url = "https://files.pythonhosted.org/packages/d5/12/00fac39ad26f762c50e5428cc8b3c83de28c5d64b5b858181583522a4e28/pycocotools-2.0.10-cp311-cp311-win_amd64.whl", hash = "sha256:03c3aacec2a6aa5171016303a539d07a7b22a34557456eadf0eb40853bdd813e", size = 80808, upload-time = "2025-06-04T23:37:11.865Z" }, + { url = "https://files.pythonhosted.org/packages/3d/cd/50970a64365f013151086d54d60b40369cf612f117d72cd9d6bd2966932c/pycocotools-2.0.10-cp311-cp311-win_arm64.whl", hash = "sha256:1f942352b1ab11b9732443ab832cbe5836441f4ec30e1f61b44e1421dbb0a0f5", size = 69566, upload-time = "2025-06-04T23:37:13.067Z" }, + { url = "https://files.pythonhosted.org/packages/d7/b4/3b87dce90fc81b8283b2b0e32b22642939e25f3a949581cb6777f5eebb12/pycocotools-2.0.10-cp312-abi3-macosx_10_13_universal2.whl", hash = "sha256:e1359f556986c8c4ac996bf8e473ff891d87630491357aaabd12601687af5edb", size = 142896, upload-time = "2025-06-04T23:37:14.748Z" }, + { url = "https://files.pythonhosted.org/packages/29/d5/b17bb67722432a191cb86121cda33cd8edb4d5b15beda43bc97a7d5ae404/pycocotools-2.0.10-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:075788c90bfa6a8989d628932854f3e32c25dac3c1bf7c1183cefad29aee16c8", size = 390111, upload-time = "2025-06-04T23:37:16.588Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/80/912b4c60f94e747dd2c3adbda5d4a4edc1d735fbfa0d91ab2eb231decb5d/pycocotools-2.0.10-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4539d8b29230de042f574012edd0b5227528da083c4f12bbd6488567aabd3920", size = 397099, upload-time = "2025-06-04T23:37:18.105Z" }, + { url = "https://files.pythonhosted.org/packages/df/d7/b3c2f731252a096bbae1a47cb1bbeab4560620a82585d40cce67eca5f043/pycocotools-2.0.10-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:da7b339624d0f78aa5bdc1c86a53f2dcb36ae7e10ab5fe45ba69878bb7837c7a", size = 396111, upload-time = "2025-06-04T23:37:20.642Z" }, + { url = "https://files.pythonhosted.org/packages/2c/6f/2eceba57245bfc86174263e12716cbe91b329a3677fbeff246148ce6a664/pycocotools-2.0.10-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ffdbf8810f27b32c5c5c85d9cd65e8e066852fef9775e58a7b23abdffeaf8252", size = 416393, upload-time = "2025-06-04T23:37:22.287Z" }, + { url = "https://files.pythonhosted.org/packages/e1/31/d87f781759b2ad177dd6d41c5fe0ce154f14fc8b384e9b80cd21a157395b/pycocotools-2.0.10-cp312-abi3-win_amd64.whl", hash = "sha256:998a88f90bb663548e767470181175343d406b6673b8b9ef5bdbb3a6d3eb3b11", size = 76824, upload-time = "2025-06-04T23:37:23.744Z" }, + { url = "https://files.pythonhosted.org/packages/27/13/7674d61658b58b8310e3de1270bce18f92a6ee8136e54a7e5696d6f72fd4/pycocotools-2.0.10-cp312-abi3-win_arm64.whl", hash = "sha256:76cd86a80171f8f7da3250be0e40d75084f1f1505d376ae0d08ed0be1ba8a90d", size = 64753, upload-time = "2025-06-04T23:37:25.202Z" }, +] + +[[package]] +name = "pycparser" +version = "2.23" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/cf/d2d3b9f5699fb1e4615c8e32ff220203e43b248e1dfcc6736ad9057731ca/pycparser-2.23.tar.gz", hash = "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2", size = 173734, upload-time = "2025-09-09T13:23:47.91Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a0/e3/59cd50310fc9b59512193629e1984c1f95e5c8ae6e5d8c69532ccc65a7fe/pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934", size = 118140, upload-time = "2025-09-09T13:23:46.651Z" }, +] + +[[package]] +name = "pydantic" +version = "2.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" }, +] + +[[package]] +name = "pydantic-argparse" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/ea/e63d587294c20d3b83e9c312b5d577c9ec28962ee8490839ca9996672849/pydantic_argparse-0.10.0.tar.gz", hash = "sha256:d57eb0a84c8f0af6605376157d3f445cfd786700f2e596ba9d48d15d557185eb", size = 15928, upload-time = "2025-02-09T08:18:30.425Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/14/9ee71e3a183f76ff93e46b36157d6ddbf29ec2547b7d2c57931cd5d3aecc/pydantic_argparse-0.10.0-py3-none-any.whl", hash = "sha256:e317f001208d77a5600ece6f7ac78d768d8221a7d64a958980705e9630c2e299", size = 25265, upload-time = "2025-02-09T08:18:27.671Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.33.2" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" }, + { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, + { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, + { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size 
= 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, + { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" }, + { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, + { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, + { url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = 
"2025-04-23T18:31:19.205Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, + { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, + { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, + { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, + { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, + { url = 
"https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" }, + { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, + { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, + { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, + { url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, + { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, +] + +[[package]] 
+name = "pydantic-settings" +version = "2.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/20/c5/dbbc27b814c71676593d1c3f718e6cd7d4f00652cefa24b75f7aa3efb25e/pydantic_settings-2.11.0.tar.gz", hash = "sha256:d0e87a1c7d33593beb7194adb8470fc426e95ba02af83a0f23474a04c9a08180", size = 188394, upload-time = "2025-09-24T14:19:11.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/d6/887a1ff844e64aa823fb4905978d882a633cfe295c32eacad582b78a7d8b/pydantic_settings-2.11.0-py3-none-any.whl", hash = "sha256:fe2cea3413b9530d10f3a5875adffb17ada5c1e1bab0b2885546d7310415207c", size = 48608, upload-time = "2025-09-24T14:19:10.015Z" }, +] + +[[package]] +name = "pydrive2" +version = "1.21.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, + { name = "google-api-python-client" }, + { name = "oauth2client" }, + { name = "pyopenssl" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3f/dc/92b0beba58f09441219bb6720bebdb895317632db4778cfe1d21532d27e5/pydrive2-1.21.3.tar.gz", hash = "sha256:649b84d60c637bc7146485039535aa8f1254ad156423739f07e5d32507447c13", size = 63348, upload-time = "2024-11-29T09:49:53.556Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/de/eef2e2661371b02d4231c5cacbb758a52ea9ea98cb5f52d69298641e2631/PyDrive2-1.21.3-py3-none-any.whl", hash = "sha256:843a304f500e71508162807001f5e19487f272e8ff5648f43582bd24c6250200", size = 47972, upload-time = "2024-11-29T09:49:51.254Z" }, +] + +[[package]] +name = "pyee" +version = "13.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/03/1fd98d5841cd7964a27d729ccf2199602fe05eb7a405c1462eb7277945ed/pyee-13.0.0.tar.gz", 
hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37", size = 31250, upload-time = "2025-03-17T18:53:15.955Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/4d/b9add7c84060d4c1906abe9a7e5359f2a60f7a9a4f67268b2766673427d8/pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498", size = 15730, upload-time = "2025-03-17T18:53:14.532Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pyjwt" +version = "2.11.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5a/b46fa56bf322901eee5b0454a34343cdbdae202cd421775a8ee4e42fd519/pyjwt-2.11.0.tar.gz", hash = "sha256:35f95c1f0fbe5d5ba6e43f00271c275f7a1a4db1dab27bf708073b75318ea623", size = 98019, upload-time = "2026-01-30T19:59:55.694Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6f/01/c26ce75ba460d5cd503da9e13b21a33804d38c2165dec7b716d06b13010c/pyjwt-2.11.0-py3-none-any.whl", hash = "sha256:94a6bde30eb5c8e04fee991062b534071fd1439ef58d2adc9ccb823e7bcd0469", size = 28224, upload-time = "2026-01-30T19:59:54.539Z" }, +] + +[package.optional-dependencies] +crypto = [ + { name = "cryptography" }, +] + +[[package]] +name = "pymupdf" +version = "1.26.3" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6d/d4/70a265e4bcd43e97480ae62da69396ef4507c8f9cfd179005ee731c92a04/pymupdf-1.26.3.tar.gz", hash = "sha256:b7d2c3ffa9870e1e4416d18862f5ccd356af5fe337b4511093bbbce2ca73b7e5", size = 75990308, upload-time = "2025-07-02T21:34:22.243Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/70/d3/c7af70545cd3097a869fd635bb6222108d3a0fb28c0b8254754a126c4cbb/pymupdf-1.26.3-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ded891963944e5f13b03b88f6d9e982e816a4ec8689fe360876eef000c161f2b", size = 23057205, upload-time = "2025-07-02T21:26:16.326Z" }, + { url = "https://files.pythonhosted.org/packages/04/3d/ec5b69bfeaa5deefa7141fc0b20d77bb20404507cf17196b4eb59f1f2977/pymupdf-1.26.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:436a33c738bb10eadf00395d18a6992b801ffb26521ee1f361ae786dd283327a", size = 22406630, upload-time = "2025-07-02T21:27:10.112Z" }, + { url = "https://files.pythonhosted.org/packages/fc/20/661d3894bb05ad75ed6ca103ee2c3fa44d88a458b5c8d4a946b9c0f2569b/pymupdf-1.26.3-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a2d7a3cd442f12f05103cb3bb1415111517f0a97162547a3720f3bbbc5e0b51c", size = 23450287, upload-time = "2025-07-03T07:22:19.317Z" }, + { url = "https://files.pythonhosted.org/packages/9c/7f/21828f018e65b16a033731d21f7b46d93fa81c6e8257f769ca4a1c2a1cb0/pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:454f38c8cf07eb333eb4646dca10517b6e90f57ce2daa2265a78064109d85555", size = 24057319, upload-time = "2025-07-02T21:28:26.697Z" }, + { url = "https://files.pythonhosted.org/packages/71/5d/e8f88cd5a45b8f5fa6590ce8cef3ce0fad30eac6aac8aea12406f95bee7d/pymupdf-1.26.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:759b75d2f710ff4edf8d097d2e98f60e9ecef47632cead6f949b3412facdb9f0", size = 24261350, upload-time = "2025-07-02T21:29:21.733Z" }, + { url = 
"https://files.pythonhosted.org/packages/82/22/ecc560e4f281b5dffafbf3a81f023d268b1746d028044f495115b74a2e70/pymupdf-1.26.3-cp39-abi3-win32.whl", hash = "sha256:a839ed44742faa1cd4956bb18068fe5aae435d67ce915e901318646c4e7bbea6", size = 17116371, upload-time = "2025-07-02T21:30:23.253Z" }, + { url = "https://files.pythonhosted.org/packages/4a/26/8c72973b8833a72785cedc3981eb59b8ac7075942718bbb7b69b352cdde4/pymupdf-1.26.3-cp39-abi3-win_amd64.whl", hash = "sha256:b4cd5124d05737944636cf45fc37ce5824f10e707b0342efe109c7b6bd37a9cc", size = 18735124, upload-time = "2025-07-02T21:31:10.992Z" }, +] + +[[package]] +name = "pynndescent" +version = "0.5.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "llvmlite" }, + { name = "numba" }, + { name = "scikit-learn" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/58/560a4db5eb3794d922fe55804b10326534ded3d971e1933c1eef91193f5e/pynndescent-0.5.13.tar.gz", hash = "sha256:d74254c0ee0a1eeec84597d5fe89fedcf778593eeabe32c2f97412934a9800fb", size = 2975955, upload-time = "2024-06-17T15:48:32.914Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/53/d23a97e0a2c690d40b165d1062e2c4ccc796be458a1ce59f6ba030434663/pynndescent-0.5.13-py3-none-any.whl", hash = "sha256:69aabb8f394bc631b6ac475a1c7f3994c54adf3f51cd63b2730fefba5771b949", size = 56850, upload-time = "2024-06-17T15:48:31.184Z" }, +] + +[[package]] +name = "pyopenssl" +version = "24.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5d/70/ff56a63248562e77c0c8ee4aefc3224258f1856977e0c1472672b62dadb8/pyopenssl-24.2.1.tar.gz", hash = "sha256:4247f0dbe3748d560dcbb2ff3ea01af0f9a1a001ef5f7c4c647956ed8cbf0e95", size = 184323, upload-time = "2024-07-20T17:26:31.252Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d9/dd/e0aa7ebef5168c75b772eda64978c597a9129b46be17779054652a7999e4/pyOpenSSL-24.2.1-py3-none-any.whl", hash = "sha256:967d5719b12b243588573f39b0c677637145c7a1ffedcd495a487e58177fbb8d", size = 58390, upload-time = "2024-07-20T17:26:29.057Z" }, +] + +[[package]] +name = "pyparsing" +version = "3.2.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f2/a5/181488fc2b9d093e3972d2a472855aae8a03f000592dbfce716a512b3359/pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6", size = 1099274, upload-time = "2025-09-21T04:11:06.277Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/5e/1aa9a93198c6b64513c9d7752de7422c06402de6600a8767da1524f9570b/pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e", size = 113890, upload-time = "2025-09-21T04:11:04.117Z" }, +] + +[[package]] +name = "pypdf2" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" }, +] + +[[package]] +name = "pypdfium2" +version = "5.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/83/173dab58beb6c7e772b838199014c173a2436018dd7cfde9bbf4a3be15da/pypdfium2-5.3.0.tar.gz", hash = 
"sha256:2873ffc95fcb01f329257ebc64a5fdce44b36447b6b171fe62f7db5dc3269885", size = 268742, upload-time = "2026-01-05T16:29:03.02Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/a4/6bb5b5918c7fc236ec426be8a0205a984fe0a26ae23d5e4dd497398a6571/pypdfium2-5.3.0-py3-none-android_23_arm64_v8a.whl", hash = "sha256:885df6c78d41600cb086dc0c76b912d165b5bd6931ca08138329ea5a991b3540", size = 2763287, upload-time = "2026-01-05T16:28:24.21Z" }, + { url = "https://files.pythonhosted.org/packages/3e/64/24b41b906006bf07099b095f0420ee1f01a3a83a899f3e3731e4da99c06a/pypdfium2-5.3.0-py3-none-android_23_armeabi_v7a.whl", hash = "sha256:6e53dee6b333ee77582499eff800300fb5aa0c7eb8f52f95ccb5ca35ebc86d48", size = 2303285, upload-time = "2026-01-05T16:28:26.274Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c0/3ec73f4ded83ba6c02acf6e9d228501759d5d74fe57f1b93849ab92dcc20/pypdfium2-5.3.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ce4466bdd62119fe25a5f74d107acc9db8652062bf217057630c6ff0bb419523", size = 2816066, upload-time = "2026-01-05T16:28:28.099Z" }, + { url = "https://files.pythonhosted.org/packages/62/ca/e553b3b8b5c2cdc3d955cc313493ac27bbe63fc22624769d56ded585dd5e/pypdfium2-5.3.0-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:cc2647fd03db42b8a56a8835e8bc7899e604e2042cd6fedeea53483185612907", size = 2945545, upload-time = "2026-01-05T16:28:29.489Z" }, + { url = "https://files.pythonhosted.org/packages/a1/56/615b776071e95c8570d579038256d0c77969ff2ff381e427be4ab8967f44/pypdfium2-5.3.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35e205f537ddb4069e4b4e22af7ffe84fcf2d686c3fee5e5349f73268a0ef1ca", size = 2979892, upload-time = "2026-01-05T16:28:31.088Z" }, + { url = "https://files.pythonhosted.org/packages/df/10/27114199b765bdb7d19a9514c07036ad2fc3a579b910e7823ba167ead6de/pypdfium2-5.3.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5795298f44050797ac030994fc2525ea35d2d714efe70058e0ee22e5f613f27", 
size = 2765738, upload-time = "2026-01-05T16:28:33.18Z" }, + { url = "https://files.pythonhosted.org/packages/b4/d7/2a3afa35e6c205a4f6264c33b8d2f659707989f93c30b336aa58575f66fa/pypdfium2-5.3.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7cd43dfceb77137e69e74c933d41506da1dddaff70f3a794fb0ad0d73e90d75", size = 3064338, upload-time = "2026-01-05T16:28:34.731Z" }, + { url = "https://files.pythonhosted.org/packages/a2/f1/6658755cf6e369bb51d0bccb81c51c300404fbe67c2f894c90000b6442dd/pypdfium2-5.3.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5956867558fd3a793e58691cf169718864610becb765bfe74dd83f05cbf1ae3", size = 3415059, upload-time = "2026-01-05T16:28:37.313Z" }, + { url = "https://files.pythonhosted.org/packages/f5/34/f86482134fa641deb1f524c45ec7ebd6fc8d404df40c5657ddfce528593e/pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3ff1071e9a782625822658dfe6e29e3a644a66960f8713bb17819f5a0ac5987", size = 2998517, upload-time = "2026-01-05T16:28:38.873Z" }, + { url = "https://files.pythonhosted.org/packages/09/34/40ab99425dcf503c172885904c5dc356c052bfdbd085f9f3cc920e0b8b25/pypdfium2-5.3.0-py3-none-manylinux_2_27_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f319c46ead49d289ab8c1ed2ea63c91e684f35bdc4cf4dc52191c441182ac481", size = 3673154, upload-time = "2026-01-05T16:28:40.347Z" }, + { url = "https://files.pythonhosted.org/packages/a5/67/0f7532f80825a7728a5cbff3f1104857f8f9fe49ebfd6cb25582a89ae8e1/pypdfium2-5.3.0-py3-none-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6dc67a186da0962294321cace6ccc0a4d212dbc5e9522c640d35725a812324b8", size = 2965002, upload-time = "2026-01-05T16:28:42.143Z" }, + { url = "https://files.pythonhosted.org/packages/ce/6c/c03d2a3d6621b77aac9604bce1c060de2af94950448787298501eac6c6a2/pypdfium2-5.3.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0ad0afd3d2b5b54d86287266fd6ae3fef0e0a1a3df9d2c4984b3e3f8f70e6330", size = 4130530, 
upload-time = "2026-01-05T16:28:44.264Z" }, + { url = "https://files.pythonhosted.org/packages/af/39/9ad1f958cbe35d4693ae87c09ebafda4bb3e4709c7ccaec86c1a829163a3/pypdfium2-5.3.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1afe35230dc3951b3e79b934c0c35a2e79e2372d06503fce6cf1926d2a816f47", size = 3746568, upload-time = "2026-01-05T16:28:45.897Z" }, + { url = "https://files.pythonhosted.org/packages/2a/e2/4d32310166c2d6955d924737df8b0a3e3efc8d133344a98b10f96320157d/pypdfium2-5.3.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:00385793030cadce08469085cd21b168fd8ff981b009685fef3103bdc5fc4686", size = 4336683, upload-time = "2026-01-05T16:28:47.584Z" }, + { url = "https://files.pythonhosted.org/packages/14/ea/38c337ff12a8cec4b00fd4fdb0a63a70597a344581e20b02addbd301ab56/pypdfium2-5.3.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:d911e82676398949697fef80b7f412078df14d725a91c10e383b727051530285", size = 4375030, upload-time = "2026-01-05T16:28:49.5Z" }, + { url = "https://files.pythonhosted.org/packages/a1/77/9d8de90c35d2fc383be8819bcde52f5821dacbd7404a0225e4010b99d080/pypdfium2-5.3.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:ca1dc625ed347fac3d9002a3ed33d521d5803409bd572e7b3f823c12ab2ef58f", size = 3928914, upload-time = "2026-01-05T16:28:51.433Z" }, + { url = "https://files.pythonhosted.org/packages/a5/39/9d4a6fbd78fcb6803b0ea5e4952a31d6182a0aaa2609cfcd0eb88446fdb8/pypdfium2-5.3.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:ea4f9db2d3575f22cd41f4c7a855240ded842f135e59a961b5b1351a65ce2b6e", size = 4997777, upload-time = "2026-01-05T16:28:53.589Z" }, + { url = "https://files.pythonhosted.org/packages/9d/38/cdd4ed085c264234a59ad32df1dfe432c77a7403da2381e0fcc1ba60b74e/pypdfium2-5.3.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:0ea24409613df350223c6afc50911c99dca0d43ddaf2616c5a1ebdffa3e1bcb5", size = 4179895, upload-time = "2026-01-05T16:28:55.322Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/4c/d2f40145c9012482699664f615d7ae540a346c84f68a8179449e69dcc4d8/pypdfium2-5.3.0-py3-none-win32.whl", hash = "sha256:5bf695d603f9eb8fdd7c1786add5cf420d57fbc81df142ed63c029ce29614df9", size = 2993570, upload-time = "2026-01-05T16:28:58.37Z" }, + { url = "https://files.pythonhosted.org/packages/2c/dc/1388ea650020c26ef3f68856b9227e7f153dcaf445e7e4674a0b8f26891e/pypdfium2-5.3.0-py3-none-win_amd64.whl", hash = "sha256:8365af22a39d4373c265f8e90e561cd64d4ddeaf5e6a66546a8caed216ab9574", size = 3102340, upload-time = "2026-01-05T16:28:59.933Z" }, + { url = "https://files.pythonhosted.org/packages/c8/71/a433668d33999b3aeb2c2dda18aaf24948e862ea2ee148078a35daac6c1c/pypdfium2-5.3.0-py3-none-win_arm64.whl", hash = "sha256:0b2c6bf825e084d91d34456be54921da31e9199d9530b05435d69d1a80501a12", size = 2940987, upload-time = "2026-01-05T16:29:01.511Z" }, +] + +[[package]] +name = "pyreadline3" +version = "3.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" }, +] + +[[package]] +name = "pysocks" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/11/293dd436aea955d45fc4e8a35b6ae7270f5b8e00b53cf6c024c83b657a11/PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0", size = 284429, upload-time = "2019-09-20T02:07:35.714Z" } +wheels = 
[ + { url = "https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", size = 16725, upload-time = "2019-09-20T02:06:22.938Z" }, +] + +[[package]] +name = "pytesseract" +version = "0.3.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/a6/7d679b83c285974a7cb94d739b461fa7e7a9b17a3abfd7bf6cbc5c2394b0/pytesseract-0.3.13.tar.gz", hash = "sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9", size = 17689, upload-time = "2024-08-16T02:33:56.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" }, +] + +[[package]] +name = "pytest" +version = "8.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, +] + +[[package]] +name = "python-barcode" 
+version = "0.16.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0e/f2/4c0b07f100e1e184ba682021322c336bbba6aa7adfabd2616f70eff917d9/python_barcode-0.16.1.tar.gz", hash = "sha256:665ed09516b0088b5593061c5ac8662caa0b08d56bdad328388b1cab39939ac5", size = 233777, upload-time = "2025-08-27T11:05:45.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/34/810885dca784b02e5ad0f71ced9c06ba5e9d33a6493bc886f7470ce82a39/python_barcode-0.16.1-py3-none-any.whl", hash = "sha256:5776567478c9a0dae473374bb86631ba0b5ea99aaf302763b364e367ac51f367", size = 228046, upload-time = "2025-08-27T11:05:42.776Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, +] + +[[package]] +name = "python-levenshtein" +version = "0.27.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "levenshtein" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/13/f6/d865a565b7eeef4b5f9a18accafb03d5730c712420fc84a3a40555f7ea6b/python_levenshtein-0.27.1.tar.gz", hash = "sha256:3a5314a011016d373d309a68e875fd029caaa692ad3f32e78319299648045f11", size = 12326, upload-time = "2025-03-02T19:47:25.641Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/95/8c8fd923b0a702388da4f9e0368f490d123cc5224279e6a083984304a15e/python_levenshtein-0.27.1-py3-none-any.whl", hash = "sha256:e1a4bc2a70284b2ebc4c505646142fecd0f831e49aa04ed972995895aec57396", size = 9426, upload-time = "2025-03-02T19:47:24.801Z" }, +] + +[[package]] +name = "python-multipart" +version = "0.0.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, +] + +[[package]] +name = "pytorch-ignite" +version = "0.5.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "torch" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/5a/e5/7fe880b24de30b4eadc8d997ea8d3c4a8f507b1a34dcdced08d88f665ee3/pytorch_ignite-0.5.3.tar.gz", hash = "sha256:75c645f02fea66cc80c1998ade3f8402e0e6b6d73f3f4ad727c171f6e93874f4", size = 7506607, upload-time = "2025-10-16T00:42:05.142Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/ea/f6d5ee7433a5a1c1e4746e2a4e9a222eab545fdbe04b66754ffdab479ee8/pytorch_ignite-0.5.3-py3-none-any.whl", hash = "sha256:4ced7539c690a3b6f3116da7878389954dff787c33669f83b38221f3746bc63e", size = 343802, upload-time = "2025-10-16T00:41:55.738Z" }, +] + +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, +] + +[[package]] +name = "pyxdameraulevenshtein" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/db/b529c031e92a36ded5cca06b48e31e4e2a388da231de84ccb383c8e27af1/pyxDamerauLevenshtein-1.9.0.tar.gz", hash = "sha256:50c84b1b7272c4f1dcee732d6b1713f4871921c99e4cf80e722c65928ca94ce1", size = 73255, upload-time = "2025-10-01T03:55:24.931Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/5e/f3b7c546274a9f908e307d9811c1e25da21df33a288a14b5edc54b0d8bfc/pyxdameraulevenshtein-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b77f6d620ee0a1706005033db847897e22dddfaa152dfa3b4babde1de880c194", size = 32639, 
upload-time = "2025-10-01T03:55:46.017Z" }, + { url = "https://files.pythonhosted.org/packages/9a/30/dabd621749c4a2138ee2805e0d12235abdfa25e4c3a4a9397de0b20028e7/pyxdameraulevenshtein-1.9.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:11ff877684aef45045896234383e216a9e55083cc1eb139e9d19579f894f459e", size = 38198, upload-time = "2025-10-01T03:55:32.211Z" }, + { url = "https://files.pythonhosted.org/packages/ef/8d/4918da22a6e7416b5bd834cf0ba5cebb440f3cb0f19dd46cb12f5aae630d/pyxdameraulevenshtein-1.9.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2b33d87166eaa9133903a163dcca71fd35a7f20a6c62929187958c6a5e87359f", size = 38563, upload-time = "2025-10-01T03:55:27.02Z" }, + { url = "https://files.pythonhosted.org/packages/28/22/fd466f404e7f1a8d3fe307121a796ad2100ad837863d51f36a747aca2678/pyxdameraulevenshtein-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:42e96b2d340ecb60cd5bba31766d783f5e1175fa63c7973b61e33c2fdffdd771", size = 30924, upload-time = "2025-10-01T03:55:44.627Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload-time = "2024-08-06T20:32:03.408Z" }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload-time = "2024-08-06T20:32:04.926Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload-time = "2024-08-06T20:32:06.459Z" }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload-time = "2024-08-06T20:32:08.338Z" }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload-time = "2024-08-06T20:32:14.124Z" }, + { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload-time = "2024-08-06T20:32:16.17Z" }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload-time = "2024-08-06T20:32:18.555Z" }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 
143850, upload-time = "2024-08-06T20:32:19.889Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload-time = "2024-08-06T20:32:21.273Z" }, +] + +[[package]] +name = "pyzmq" +version = "27.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "implementation_name == 'pypy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/0b/3c9baedbdf613ecaa7aa07027780b8867f57b6293b6ee50de316c9f3222b/pyzmq-27.1.0.tar.gz", hash = "sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540", size = 281750, upload-time = "2025-09-08T23:10:18.157Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/5d/305323ba86b284e6fcb0d842d6adaa2999035f70f8c38a9b6d21ad28c3d4/pyzmq-27.1.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:226b091818d461a3bef763805e75685e478ac17e9008f49fce2d3e52b3d58b86", size = 1333328, upload-time = "2025-09-08T23:07:45.946Z" }, + { url = "https://files.pythonhosted.org/packages/bd/a0/fc7e78a23748ad5443ac3275943457e8452da67fda347e05260261108cbc/pyzmq-27.1.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0790a0161c281ca9723f804871b4027f2e8b5a528d357c8952d08cd1a9c15581", size = 908803, upload-time = "2025-09-08T23:07:47.551Z" }, + { url = "https://files.pythonhosted.org/packages/7e/22/37d15eb05f3bdfa4abea6f6d96eb3bb58585fbd3e4e0ded4e743bc650c97/pyzmq-27.1.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c895a6f35476b0c3a54e3eb6ccf41bf3018de937016e6e18748317f25d4e925f", size = 668836, upload-time = "2025-09-08T23:07:49.436Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/c4/2a6fe5111a01005fc7af3878259ce17684fabb8852815eda6225620f3c59/pyzmq-27.1.0-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bbf8d3630bf96550b3be8e1fc0fea5cbdc8d5466c1192887bd94869da17a63e", size = 857038, upload-time = "2025-09-08T23:07:51.234Z" }, + { url = "https://files.pythonhosted.org/packages/cb/eb/bfdcb41d0db9cd233d6fb22dc131583774135505ada800ebf14dfb0a7c40/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:15c8bd0fe0dabf808e2d7a681398c4e5ded70a551ab47482067a572c054c8e2e", size = 1657531, upload-time = "2025-09-08T23:07:52.795Z" }, + { url = "https://files.pythonhosted.org/packages/ab/21/e3180ca269ed4a0de5c34417dfe71a8ae80421198be83ee619a8a485b0c7/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bafcb3dd171b4ae9f19ee6380dfc71ce0390fefaf26b504c0e5f628d7c8c54f2", size = 2034786, upload-time = "2025-09-08T23:07:55.047Z" }, + { url = "https://files.pythonhosted.org/packages/3b/b1/5e21d0b517434b7f33588ff76c177c5a167858cc38ef740608898cd329f2/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e829529fcaa09937189178115c49c504e69289abd39967cd8a4c215761373394", size = 1894220, upload-time = "2025-09-08T23:07:57.172Z" }, + { url = "https://files.pythonhosted.org/packages/03/f2/44913a6ff6941905efc24a1acf3d3cb6146b636c546c7406c38c49c403d4/pyzmq-27.1.0-cp311-cp311-win32.whl", hash = "sha256:6df079c47d5902af6db298ec92151db82ecb557af663098b92f2508c398bb54f", size = 567155, upload-time = "2025-09-08T23:07:59.05Z" }, + { url = "https://files.pythonhosted.org/packages/23/6d/d8d92a0eb270a925c9b4dd039c0b4dc10abc2fcbc48331788824ef113935/pyzmq-27.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:190cbf120fbc0fc4957b56866830def56628934a9d112aec0e2507aa6a032b97", size = 633428, upload-time = "2025-09-08T23:08:00.663Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/14/01afebc96c5abbbd713ecfc7469cfb1bc801c819a74ed5c9fad9a48801cb/pyzmq-27.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:eca6b47df11a132d1745eb3b5b5e557a7dae2c303277aa0e69c6ba91b8736e07", size = 559497, upload-time = "2025-09-08T23:08:02.15Z" }, + { url = "https://files.pythonhosted.org/packages/92/e7/038aab64a946d535901103da16b953c8c9cc9c961dadcbf3609ed6428d23/pyzmq-27.1.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc", size = 1306279, upload-time = "2025-09-08T23:08:03.807Z" }, + { url = "https://files.pythonhosted.org/packages/e8/5e/c3c49fdd0f535ef45eefcc16934648e9e59dace4a37ee88fc53f6cd8e641/pyzmq-27.1.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113", size = 895645, upload-time = "2025-09-08T23:08:05.301Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e5/b0b2504cb4e903a74dcf1ebae157f9e20ebb6ea76095f6cfffea28c42ecd/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233", size = 652574, upload-time = "2025-09-08T23:08:06.828Z" }, + { url = "https://files.pythonhosted.org/packages/f8/9b/c108cdb55560eaf253f0cbdb61b29971e9fb34d9c3499b0e96e4e60ed8a5/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31", size = 840995, upload-time = "2025-09-08T23:08:08.396Z" }, + { url = "https://files.pythonhosted.org/packages/c2/bb/b79798ca177b9eb0825b4c9998c6af8cd2a7f15a6a1a4272c1d1a21d382f/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28", size = 1642070, upload-time = "2025-09-08T23:08:09.989Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/80/2df2e7977c4ede24c79ae39dcef3899bfc5f34d1ca7a5b24f182c9b7a9ca/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856", size = 2021121, upload-time = "2025-09-08T23:08:11.907Z" }, + { url = "https://files.pythonhosted.org/packages/46/bd/2d45ad24f5f5ae7e8d01525eb76786fa7557136555cac7d929880519e33a/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496", size = 1878550, upload-time = "2025-09-08T23:08:13.513Z" }, + { url = "https://files.pythonhosted.org/packages/e6/2f/104c0a3c778d7c2ab8190e9db4f62f0b6957b53c9d87db77c284b69f33ea/pyzmq-27.1.0-cp312-abi3-win32.whl", hash = "sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd", size = 559184, upload-time = "2025-09-08T23:08:15.163Z" }, + { url = "https://files.pythonhosted.org/packages/fc/7f/a21b20d577e4100c6a41795842028235998a643b1ad406a6d4163ea8f53e/pyzmq-27.1.0-cp312-abi3-win_amd64.whl", hash = "sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf", size = 619480, upload-time = "2025-09-08T23:08:17.192Z" }, + { url = "https://files.pythonhosted.org/packages/78/c2/c012beae5f76b72f007a9e91ee9401cb88c51d0f83c6257a03e785c81cc2/pyzmq-27.1.0-cp312-abi3-win_arm64.whl", hash = "sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f", size = 552993, upload-time = "2025-09-08T23:08:18.926Z" }, + { url = "https://files.pythonhosted.org/packages/4c/c6/c4dcdecdbaa70969ee1fdced6d7b8f60cfabe64d25361f27ac4665a70620/pyzmq-27.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:18770c8d3563715387139060d37859c02ce40718d1faf299abddcdcc6a649066", size = 836265, upload-time = "2025-09-08T23:09:49.376Z" }, + { url = 
"https://files.pythonhosted.org/packages/3e/79/f38c92eeaeb03a2ccc2ba9866f0439593bb08c5e3b714ac1d553e5c96e25/pyzmq-27.1.0-pp311-pypy311_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ac25465d42f92e990f8d8b0546b01c391ad431c3bf447683fdc40565941d0604", size = 800208, upload-time = "2025-09-08T23:09:51.073Z" }, + { url = "https://files.pythonhosted.org/packages/49/0e/3f0d0d335c6b3abb9b7b723776d0b21fa7f3a6c819a0db6097059aada160/pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53b40f8ae006f2734ee7608d59ed661419f087521edbfc2149c3932e9c14808c", size = 567747, upload-time = "2025-09-08T23:09:52.698Z" }, + { url = "https://files.pythonhosted.org/packages/a1/cf/f2b3784d536250ffd4be70e049f3b60981235d70c6e8ce7e3ef21e1adb25/pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f605d884e7c8be8fe1aa94e0a783bf3f591b84c24e4bc4f3e7564c82ac25e271", size = 747371, upload-time = "2025-09-08T23:09:54.563Z" }, + { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" }, +] + +[[package]] +name = "rapidfuzz" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ed/f6/6895abc3a3d056b9698da3199b04c0e56226d530ae44a470edabf8b664f0/rapidfuzz-3.13.0.tar.gz", hash = "sha256:d2eaf3839e52cbcc0accbe9817a67b4b0fcf70aaeb229cfddc1c28061f9ce5d8", size = 57904226, upload-time = "2025-04-03T20:38:51.226Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/17/9be9eff5a3c7dfc831c2511262082c6786dca2ce21aa8194eef1cb71d67a/rapidfuzz-3.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d395a5cad0c09c7f096433e5fd4224d83b53298d53499945a9b0e5a971a84f3a", size = 
1999453, upload-time = "2025-04-03T20:35:40.804Z" }, + { url = "https://files.pythonhosted.org/packages/75/67/62e57896ecbabe363f027d24cc769d55dd49019e576533ec10e492fcd8a2/rapidfuzz-3.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7b3eda607a019169f7187328a8d1648fb9a90265087f6903d7ee3a8eee01805", size = 1450881, upload-time = "2025-04-03T20:35:42.734Z" }, + { url = "https://files.pythonhosted.org/packages/96/5c/691c5304857f3476a7b3df99e91efc32428cbe7d25d234e967cc08346c13/rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98e0bfa602e1942d542de077baf15d658bd9d5dcfe9b762aff791724c1c38b70", size = 1422990, upload-time = "2025-04-03T20:35:45.158Z" }, + { url = "https://files.pythonhosted.org/packages/46/81/7a7e78f977496ee2d613154b86b203d373376bcaae5de7bde92f3ad5a192/rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bef86df6d59667d9655905b02770a0c776d2853971c0773767d5ef8077acd624", size = 5342309, upload-time = "2025-04-03T20:35:46.952Z" }, + { url = "https://files.pythonhosted.org/packages/51/44/12fdd12a76b190fe94bf38d252bb28ddf0ab7a366b943e792803502901a2/rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fedd316c165beed6307bf754dee54d3faca2c47e1f3bcbd67595001dfa11e969", size = 1656881, upload-time = "2025-04-03T20:35:49.954Z" }, + { url = "https://files.pythonhosted.org/packages/27/ae/0d933e660c06fcfb087a0d2492f98322f9348a28b2cc3791a5dbadf6e6fb/rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5158da7f2ec02a930be13bac53bb5903527c073c90ee37804090614cab83c29e", size = 1608494, upload-time = "2025-04-03T20:35:51.646Z" }, + { url = "https://files.pythonhosted.org/packages/3d/2c/4b2f8aafdf9400e5599b6ed2f14bc26ca75f5a923571926ccbc998d4246a/rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b6f913ee4618ddb6d6f3e387b76e8ec2fc5efee313a128809fbd44e65c2bbb2", size 
= 3072160, upload-time = "2025-04-03T20:35:53.472Z" }, + { url = "https://files.pythonhosted.org/packages/60/7d/030d68d9a653c301114101c3003b31ce01cf2c3224034cd26105224cd249/rapidfuzz-3.13.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d25fdbce6459ccbbbf23b4b044f56fbd1158b97ac50994eaae2a1c0baae78301", size = 2491549, upload-time = "2025-04-03T20:35:55.391Z" }, + { url = "https://files.pythonhosted.org/packages/8e/cd/7040ba538fc6a8ddc8816a05ecf46af9988b46c148ddd7f74fb0fb73d012/rapidfuzz-3.13.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25343ccc589a4579fbde832e6a1e27258bfdd7f2eb0f28cb836d6694ab8591fc", size = 7584142, upload-time = "2025-04-03T20:35:57.71Z" }, + { url = "https://files.pythonhosted.org/packages/c1/96/85f7536fbceb0aa92c04a1c37a3fc4fcd4e80649e9ed0fb585382df82edc/rapidfuzz-3.13.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a9ad1f37894e3ffb76bbab76256e8a8b789657183870be11aa64e306bb5228fd", size = 2896234, upload-time = "2025-04-03T20:35:59.969Z" }, + { url = "https://files.pythonhosted.org/packages/55/fd/460e78438e7019f2462fe9d4ecc880577ba340df7974c8a4cfe8d8d029df/rapidfuzz-3.13.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5dc71ef23845bb6b62d194c39a97bb30ff171389c9812d83030c1199f319098c", size = 3437420, upload-time = "2025-04-03T20:36:01.91Z" }, + { url = "https://files.pythonhosted.org/packages/cc/df/c3c308a106a0993befd140a414c5ea78789d201cf1dfffb8fd9749718d4f/rapidfuzz-3.13.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b7f4c65facdb94f44be759bbd9b6dda1fa54d0d6169cdf1a209a5ab97d311a75", size = 4410860, upload-time = "2025-04-03T20:36:04.352Z" }, + { url = "https://files.pythonhosted.org/packages/75/ee/9d4ece247f9b26936cdeaae600e494af587ce9bf8ddc47d88435f05cfd05/rapidfuzz-3.13.0-cp311-cp311-win32.whl", hash = "sha256:b5104b62711565e0ff6deab2a8f5dbf1fbe333c5155abe26d2cfd6f1849b6c87", size = 1843161, upload-time = "2025-04-03T20:36:06.802Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/5a/d00e1f63564050a20279015acb29ecaf41646adfacc6ce2e1e450f7f2633/rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:9093cdeb926deb32a4887ebe6910f57fbcdbc9fbfa52252c10b56ef2efb0289f", size = 1629962, upload-time = "2025-04-03T20:36:09.133Z" }, + { url = "https://files.pythonhosted.org/packages/3b/74/0a3de18bc2576b794f41ccd07720b623e840fda219ab57091897f2320fdd/rapidfuzz-3.13.0-cp311-cp311-win_arm64.whl", hash = "sha256:f70f646751b6aa9d05be1fb40372f006cc89d6aad54e9d79ae97bd1f5fce5203", size = 866631, upload-time = "2025-04-03T20:36:11.022Z" }, + { url = "https://files.pythonhosted.org/packages/88/df/6060c5a9c879b302bd47a73fc012d0db37abf6544c57591bcbc3459673bd/rapidfuzz-3.13.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1ba007f4d35a45ee68656b2eb83b8715e11d0f90e5b9f02d615a8a321ff00c27", size = 1905935, upload-time = "2025-04-03T20:38:18.07Z" }, + { url = "https://files.pythonhosted.org/packages/a2/6c/a0b819b829e20525ef1bd58fc776fb8d07a0c38d819e63ba2b7c311a2ed4/rapidfuzz-3.13.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d7a217310429b43be95b3b8ad7f8fc41aba341109dc91e978cd7c703f928c58f", size = 1383714, upload-time = "2025-04-03T20:38:20.628Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c1/3da3466cc8a9bfb9cd345ad221fac311143b6a9664b5af4adb95b5e6ce01/rapidfuzz-3.13.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:558bf526bcd777de32b7885790a95a9548ffdcce68f704a81207be4a286c1095", size = 1367329, upload-time = "2025-04-03T20:38:23.01Z" }, + { url = "https://files.pythonhosted.org/packages/da/f0/9f2a9043bfc4e66da256b15d728c5fc2d865edf0028824337f5edac36783/rapidfuzz-3.13.0-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:202a87760f5145140d56153b193a797ae9338f7939eb16652dd7ff96f8faf64c", size = 5251057, upload-time = "2025-04-03T20:38:25.52Z" }, + { url = 
"https://files.pythonhosted.org/packages/6a/ff/af2cb1d8acf9777d52487af5c6b34ce9d13381a753f991d95ecaca813407/rapidfuzz-3.13.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcccc08f671646ccb1e413c773bb92e7bba789e3a1796fd49d23c12539fe2e4", size = 2992401, upload-time = "2025-04-03T20:38:28.196Z" }, + { url = "https://files.pythonhosted.org/packages/c1/c5/c243b05a15a27b946180db0d1e4c999bef3f4221505dff9748f1f6c917be/rapidfuzz-3.13.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:1f219f1e3c3194d7a7de222f54450ce12bc907862ff9a8962d83061c1f923c86", size = 1553782, upload-time = "2025-04-03T20:38:30.778Z" }, +] + +[[package]] +name = "realtime" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/50/340b7f4e9469c9e532657cc96179150b05a868d619183c0fbe9438d5e9ed/realtime-2.25.1.tar.gz", hash = "sha256:0ecd710c37dc42ccb01be5eb25146b249a2b73668da22fd93eae776869db57b6", size = 18537, upload-time = "2025-12-10T21:48:29.81Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/99/10ab53febfa7401ae4899e05eeffa5597523979dea280ad31ba433c9d88a/realtime-2.25.1-py3-none-any.whl", hash = "sha256:3af1da47391cc0da947b4f3850f8e0403ec9be0988c14c2fa3fe66a9458251be", size = 22139, upload-time = "2025-12-10T21:48:28.844Z" }, +] + +[[package]] +name = "redis" +version = "7.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e9/31/1476f206482dd9bc53fdbbe9f6fbd5e05d153f18e54667ce839df331f2e6/redis-7.2.1.tar.gz", hash = "sha256:6163c1a47ee2d9d01221d8456bc1c75ab953cbda18cfbc15e7140e9ba16ca3a5", size = 4906735, upload-time = "2026-02-25T20:05:18.171Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ca/98/1dd1a5c060916cf21d15e67b7d6a7078e26e2605d5c37cbc9f4f5454c478/redis-7.2.1-py3-none-any.whl", hash = "sha256:49e231fbc8df2001436ae5252b3f0f3dc930430239bfeb6da4c7ee92b16e5d33", size = 396057, upload-time = "2026-02-25T20:05:16.533Z" }, +] + +[[package]] +name = "regex" +version = "2025.7.34" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/de/e13fa6dc61d78b30ba47481f99933a3b49a57779d625c392d8036770a60d/regex-2025.7.34.tar.gz", hash = "sha256:9ead9765217afd04a86822dfcd4ed2747dfe426e887da413b15ff0ac2457e21a", size = 400714, upload-time = "2025-07-31T00:21:16.262Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/85/f497b91577169472f7c1dc262a5ecc65e39e146fc3a52c571e5daaae4b7d/regex-2025.7.34-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:da304313761b8500b8e175eb2040c4394a875837d5635f6256d6fa0377ad32c8", size = 484594, upload-time = "2025-07-31T00:19:13.927Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c5/ad2a5c11ce9e6257fcbfd6cd965d07502f6054aaa19d50a3d7fd991ec5d1/regex-2025.7.34-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:35e43ebf5b18cd751ea81455b19acfdec402e82fe0dc6143edfae4c5c4b3909a", size = 289294, upload-time = "2025-07-31T00:19:15.395Z" }, + { url = "https://files.pythonhosted.org/packages/8e/01/83ffd9641fcf5e018f9b51aa922c3e538ac9439424fda3df540b643ecf4f/regex-2025.7.34-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96bbae4c616726f4661fe7bcad5952e10d25d3c51ddc388189d8864fbc1b3c68", size = 285933, upload-time = "2025-07-31T00:19:16.704Z" }, + { url = "https://files.pythonhosted.org/packages/77/20/5edab2e5766f0259bc1da7381b07ce6eb4401b17b2254d02f492cd8a81a8/regex-2025.7.34-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9feab78a1ffa4f2b1e27b1bcdaad36f48c2fed4870264ce32f52a393db093c78", size = 792335, upload-time = "2025-07-31T00:19:18.561Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/bd/744d3ed8777dce8487b2606b94925e207e7c5931d5870f47f5b643a4580a/regex-2025.7.34-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f14b36e6d4d07f1a5060f28ef3b3561c5d95eb0651741474ce4c0a4c56ba8719", size = 858605, upload-time = "2025-07-31T00:19:20.204Z" }, + { url = "https://files.pythonhosted.org/packages/99/3d/93754176289718d7578c31d151047e7b8acc7a8c20e7706716f23c49e45e/regex-2025.7.34-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85c3a958ef8b3d5079c763477e1f09e89d13ad22198a37e9d7b26b4b17438b33", size = 905780, upload-time = "2025-07-31T00:19:21.876Z" }, + { url = "https://files.pythonhosted.org/packages/ee/2e/c689f274a92deffa03999a430505ff2aeace408fd681a90eafa92fdd6930/regex-2025.7.34-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:37555e4ae0b93358fa7c2d240a4291d4a4227cc7c607d8f85596cdb08ec0a083", size = 798868, upload-time = "2025-07-31T00:19:23.222Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9e/39673688805d139b33b4a24851a71b9978d61915c4d72b5ffda324d0668a/regex-2025.7.34-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee38926f31f1aa61b0232a3a11b83461f7807661c062df9eb88769d86e6195c3", size = 781784, upload-time = "2025-07-31T00:19:24.59Z" }, + { url = "https://files.pythonhosted.org/packages/18/bd/4c1cab12cfabe14beaa076523056b8ab0c882a8feaf0a6f48b0a75dab9ed/regex-2025.7.34-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a664291c31cae9c4a30589bd8bc2ebb56ef880c9c6264cb7643633831e606a4d", size = 852837, upload-time = "2025-07-31T00:19:25.911Z" }, + { url = "https://files.pythonhosted.org/packages/cb/21/663d983cbb3bba537fc213a579abbd0f263fb28271c514123f3c547ab917/regex-2025.7.34-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f3e5c1e0925e77ec46ddc736b756a6da50d4df4ee3f69536ffb2373460e2dafd", size = 844240, upload-time = 
"2025-07-31T00:19:27.688Z" }, + { url = "https://files.pythonhosted.org/packages/8e/2d/9beeeb913bc5d32faa913cf8c47e968da936af61ec20af5d269d0f84a100/regex-2025.7.34-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d428fc7731dcbb4e2ffe43aeb8f90775ad155e7db4347a639768bc6cd2df881a", size = 787139, upload-time = "2025-07-31T00:19:29.475Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f5/9b9384415fdc533551be2ba805dd8c4621873e5df69c958f403bfd3b2b6e/regex-2025.7.34-cp311-cp311-win32.whl", hash = "sha256:e154a7ee7fa18333ad90b20e16ef84daaeac61877c8ef942ec8dfa50dc38b7a1", size = 264019, upload-time = "2025-07-31T00:19:31.129Z" }, + { url = "https://files.pythonhosted.org/packages/18/9d/e069ed94debcf4cc9626d652a48040b079ce34c7e4fb174f16874958d485/regex-2025.7.34-cp311-cp311-win_amd64.whl", hash = "sha256:24257953d5c1d6d3c129ab03414c07fc1a47833c9165d49b954190b2b7f21a1a", size = 276047, upload-time = "2025-07-31T00:19:32.497Z" }, + { url = "https://files.pythonhosted.org/packages/fd/cf/3bafbe9d1fd1db77355e7fbbbf0d0cfb34501a8b8e334deca14f94c7b315/regex-2025.7.34-cp311-cp311-win_arm64.whl", hash = "sha256:3157aa512b9e606586900888cd469a444f9b898ecb7f8931996cb715f77477f0", size = 268362, upload-time = "2025-07-31T00:19:34.094Z" }, +] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = 
"sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[package.optional-dependencies] +socks = [ + { name = "pysocks" }, +] + +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "oauthlib" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload-time = "2024-03-22T20:32:28.055Z" }, +] + +[[package]] +name = "retrying" +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c8/5a/b17e1e257d3e6f2e7758930e1256832c9ddd576f8631781e6a072914befa/retrying-1.4.2.tar.gz", hash = "sha256:d102e75d53d8d30b88562d45361d6c6c934da06fab31bd81c0420acb97a8ba39", size = 11411, upload-time = "2025-08-03T03:35:25.189Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/f3/6cd296376653270ac1b423bb30bd70942d9916b6978c6f40472d6ac038e7/retrying-1.4.2-py3-none-any.whl", hash = "sha256:bbc004aeb542a74f3569aeddf42a2516efefcdaff90df0eb38fbfbf19f179f59", size = 10859, upload-time = "2025-08-03T03:35:23.829Z" }, +] + +[[package]] +name = "rich" +version = "14.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/30/3c4d035596d3cf444529e0b2953ad0466f6049528a879d27534700580395/rich-14.1.0-py3-none-any.whl", hash = "sha256:536f5f1785986d6dbdea3c75205c473f970777b4a0d6c6dd1b696aa05a3fa04f", size = 243368, upload-time = "2025-07-25T07:32:56.73Z" }, +] + +[[package]] +name = "rq" +version = "2.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "croniter" }, + { name = "redis" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c5/9b/93b7180220fe462b4128425e687665bcdeffddc51683d41e7fbe509c2d2e/rq-2.7.0.tar.gz", hash = "sha256:c2156fc7249b5d43dda918c4355cfbf8d0d299a5cdd3963918e9c8daf4b1e0c0", size = 679396, upload-time = "2026-02-22T11:10:50.775Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/1a/3b64696bc0c33aa1d86d3e6add03c4e0afe51110264fd41208bd95c2665c/rq-2.7.0-py3-none-any.whl", hash = "sha256:4b320e95968208d2e249fa0d3d90ee309478e2d7ea60a116f8ff9aa343a4c117", size = 115728, upload-time = "2026-02-22T11:10:48.401Z" }, +] + +[[package]] +name = "rsa" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = 
"sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, +] + +[[package]] +name = "safetensors" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, + { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, + { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, + { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, + { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, + { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, + { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, + { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, +] + +[[package]] +name = "scikit-learn" +version = "1.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/c2/a7855e41c9d285dfe86dc50b250978105dce513d6e459ea66a6aeb0e1e0c/scikit_learn-1.7.2.tar.gz", hash = "sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda", size = 7193136, upload-time = "2025-09-09T08:21:29.075Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/83/564e141eef908a5863a54da8ca342a137f45a0bfb71d1d79704c9894c9d1/scikit_learn-1.7.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e", size = 9331967, upload-time = "2025-09-09T08:20:32.421Z" }, + { url = "https://files.pythonhosted.org/packages/18/d6/ba863a4171ac9d7314c4d3fc251f015704a2caeee41ced89f321c049ed83/scikit_learn-1.7.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1", size = 8648645, upload-time = "2025-09-09T08:20:34.436Z" }, + { url = "https://files.pythonhosted.org/packages/ef/0e/97dbca66347b8cf0ea8b529e6bb9367e337ba2e8be0ef5c1a545232abfde/scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d", size = 9715424, upload-time = "2025-09-09T08:20:36.776Z" }, + { url = "https://files.pythonhosted.org/packages/f7/32/1f3b22e3207e1d2c883a7e09abb956362e7d1bd2f14458c7de258a26ac15/scikit_learn-1.7.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1", size = 9509234, upload-time = "2025-09-09T08:20:38.957Z" }, + { url = "https://files.pythonhosted.org/packages/9f/71/34ddbd21f1da67c7a768146968b4d0220ee6831e4bcbad3e03dd3eae88b6/scikit_learn-1.7.2-cp311-cp311-win_amd64.whl", hash = "sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1", size = 8894244, upload-time = "2025-09-09T08:20:41.166Z" }, +] + +[[package]] +name = "scipy" +version = "1.16.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4c/3b/546a6f0bfe791bbb7f8d591613454d15097e53f906308ec6f7c1ce588e8e/scipy-1.16.2.tar.gz", hash = "sha256:af029b153d243a80afb6eabe40b0a07f8e35c9adc269c019f364ad747f826a6b", size = 30580599, upload-time = "2025-09-11T17:48:08.271Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/0b/ef/37ed4b213d64b48422df92560af7300e10fe30b5d665dd79932baebee0c6/scipy-1.16.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:6ab88ea43a57da1af33292ebd04b417e8e2eaf9d5aa05700be8d6e1b6501cd92", size = 36619956, upload-time = "2025-09-11T17:39:20.5Z" }, + { url = "https://files.pythonhosted.org/packages/85/ab/5c2eba89b9416961a982346a4d6a647d78c91ec96ab94ed522b3b6baf444/scipy-1.16.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c95e96c7305c96ede73a7389f46ccd6c659c4da5ef1b2789466baeaed3622b6e", size = 28931117, upload-time = "2025-09-11T17:39:29.06Z" }, + { url = "https://files.pythonhosted.org/packages/80/d1/eed51ab64d227fe60229a2d57fb60ca5898cfa50ba27d4f573e9e5f0b430/scipy-1.16.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:87eb178db04ece7c698220d523c170125dbffebb7af0345e66c3554f6f60c173", size = 20921997, upload-time = "2025-09-11T17:39:34.892Z" }, + { url = "https://files.pythonhosted.org/packages/be/7c/33ea3e23bbadde96726edba6bf9111fb1969d14d9d477ffa202c67bec9da/scipy-1.16.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:4e409eac067dcee96a57fbcf424c13f428037827ec7ee3cb671ff525ca4fc34d", size = 23523374, upload-time = "2025-09-11T17:39:40.846Z" }, + { url = "https://files.pythonhosted.org/packages/96/0b/7399dc96e1e3f9a05e258c98d716196a34f528eef2ec55aad651ed136d03/scipy-1.16.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e574be127bb760f0dad24ff6e217c80213d153058372362ccb9555a10fc5e8d2", size = 33583702, upload-time = "2025-09-11T17:39:49.011Z" }, + { url = "https://files.pythonhosted.org/packages/1a/bc/a5c75095089b96ea72c1bd37a4497c24b581ec73db4ef58ebee142ad2d14/scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f5db5ba6188d698ba7abab982ad6973265b74bb40a1efe1821b58c87f73892b9", size = 35883427, upload-time = "2025-09-11T17:39:57.406Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/66/e25705ca3d2b87b97fe0a278a24b7f477b4023a926847935a1a71488a6a6/scipy-1.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec6e74c4e884104ae006d34110677bfe0098203a3fec2f3faf349f4cb05165e3", size = 36212940, upload-time = "2025-09-11T17:40:06.013Z" }, + { url = "https://files.pythonhosted.org/packages/d6/fd/0bb911585e12f3abdd603d721d83fc1c7492835e1401a0e6d498d7822b4b/scipy-1.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:912f46667d2d3834bc3d57361f854226475f695eb08c08a904aadb1c936b6a88", size = 38865092, upload-time = "2025-09-11T17:40:15.143Z" }, + { url = "https://files.pythonhosted.org/packages/d6/73/c449a7d56ba6e6f874183759f8483cde21f900a8be117d67ffbb670c2958/scipy-1.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:91e9e8a37befa5a69e9cacbe0bcb79ae5afb4a0b130fd6db6ee6cc0d491695fa", size = 38687626, upload-time = "2025-09-11T17:40:24.041Z" }, + { url = "https://files.pythonhosted.org/packages/68/72/02f37316adf95307f5d9e579023c6899f89ff3a051fa079dbd6faafc48e5/scipy-1.16.2-cp311-cp311-win_arm64.whl", hash = "sha256:f3bf75a6dcecab62afde4d1f973f1692be013110cad5338007927db8da73249c", size = 25503506, upload-time = "2025-09-11T17:40:30.703Z" }, +] + +[[package]] +name = "seaborn" +version = "0.13.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = 
"2024-01-25T13:21:49.598Z" }, +] + +[[package]] +name = "selenium" +version = "4.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "trio" }, + { name = "trio-websocket" }, + { name = "typing-extensions" }, + { name = "urllib3", extra = ["socks"] }, + { name = "websocket-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/35/33d3d84e3399c9d00b489aeccfdc78115e149e45816fb8fe84274329e8a2/selenium-4.36.0.tar.gz", hash = "sha256:0eced83038736c3a013b824116df0b6dbb83e93721545f51b680451013416723", size = 913613, upload-time = "2025-10-02T15:24:37.483Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/74/9e/642a355e43a4ebf68bc4f00dd4ab264f635079c5dc7ed6d9991a0c2be3d7/selenium-4.36.0-py3-none-any.whl", hash = "sha256:525fdfe96b99c27d9a2c773c75aa7413f4c24bdb7b9749c1950aa3b5f79ed915", size = 9587029, upload-time = "2025-10-02T15:24:35.025Z" }, +] + +[[package]] +name = "sentence-transformers" +version = "5.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "pillow" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0f/96/f3f3409179d14dbfdbea8622e2e9eaa3c8836ddcaecd2cd5ff0a11731d20/sentence_transformers-5.1.2.tar.gz", hash = "sha256:0f6c8bd916a78dc65b366feb8d22fd885efdb37432e7630020d113233af2b856", size = 375185, upload-time = "2025-10-22T12:47:55.019Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/a6/a607a737dc1a00b7afe267b9bfde101b8cee2529e197e57471d23137d4e5/sentence_transformers-5.1.2-py3-none-any.whl", hash = "sha256:724ce0ea62200f413f1a5059712aff66495bc4e815a1493f7f9bca242414c333", size = 488009, upload-time = "2025-10-22T12:47:53.433Z" }, +] + +[[package]] +name = "sentry-sdk" +version = "2.35.0" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/31/83/055dc157b719651ef13db569bb8cf2103df11174478649735c1b2bf3f6bc/sentry_sdk-2.35.0.tar.gz", hash = "sha256:5ea58d352779ce45d17bc2fa71ec7185205295b83a9dbb5707273deb64720092", size = 343014, upload-time = "2025-08-14T17:11:20.223Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/3d/742617a7c644deb0c1628dcf6bb2d2165ab7c6aab56fe5222758994007f8/sentry_sdk-2.35.0-py2.py3-none-any.whl", hash = "sha256:6e0c29b9a5d34de8575ffb04d289a987ff3053cf2c98ede445bea995e3830263", size = 363806, upload-time = "2025-08-14T17:11:18.29Z" }, +] + +[[package]] +name = "seqeval" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scikit-learn" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz", hash = "sha256:f28e97c3ab96d6fcd32b648f6438ff2e09cfba87f05939da9b3970713ec56e6f", size = 43605, upload-time = "2020-10-24T00:24:54.926Z" } + +[[package]] +name = "setuptools" +version = "78.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/9c/42314ee079a3e9c24b27515f9fbc7a3c1d29992c33451779011c74488375/setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d", size = 1368163, upload-time = "2025-04-19T18:23:36.68Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/99/158ad0609729111163fc1f674a5a42f2605371a4cf036d0441070e2f7455/setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561", size = 1256462, upload-time = "2025-04-19T18:23:34.525Z" }, +] + +[[package]] +name = "shapely" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/bc/0989043118a27cccb4e906a46b7565ce36ca7b57f5a18b78f4f1b0f72d9d/shapely-2.1.2.tar.gz", hash = "sha256:2ed4ecb28320a433db18a5bf029986aa8afcfd740745e78847e330d5d94922a9", size = 315489, upload-time = "2025-09-24T13:51:41.432Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/8d/1ff672dea9ec6a7b5d422eb6d095ed886e2e523733329f75fdcb14ee1149/shapely-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91121757b0a36c9aac3427a651a7e6567110a4a67c97edf04f8d55d4765f6618", size = 1820038, upload-time = "2025-09-24T13:50:15.628Z" }, + { url = "https://files.pythonhosted.org/packages/4f/ce/28fab8c772ce5db23a0d86bf0adaee0c4c79d5ad1db766055fa3dab442e2/shapely-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16a9c722ba774cf50b5d4541242b4cce05aafd44a015290c82ba8a16931ff63d", size = 1626039, upload-time = "2025-09-24T13:50:16.881Z" }, + { url = "https://files.pythonhosted.org/packages/70/8b/868b7e3f4982f5006e9395c1e12343c66a8155c0374fdc07c0e6a1ab547d/shapely-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cc4f7397459b12c0b196c9efe1f9d7e92463cbba142632b4cc6d8bbbbd3e2b09", size = 3001519, upload-time = "2025-09-24T13:50:18.606Z" }, + { url = "https://files.pythonhosted.org/packages/13/02/58b0b8d9c17c93ab6340edd8b7308c0c5a5b81f94ce65705819b7416dba5/shapely-2.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:136ab87b17e733e22f0961504d05e77e7be8c9b5a8184f685b4a91a84efe3c26", size = 3110842, upload-time = "2025-09-24T13:50:21.77Z" }, + { url = "https://files.pythonhosted.org/packages/af/61/8e389c97994d5f331dcffb25e2fa761aeedfb52b3ad9bcdd7b8671f4810a/shapely-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:16c5d0fc45d3aa0a69074979f4f1928ca2734fb2e0dde8af9611e134e46774e7", size = 4021316, upload-time = "2025-09-24T13:50:23.626Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/d4/9b2a9fe6039f9e42ccf2cb3e84f219fd8364b0c3b8e7bbc857b5fbe9c14c/shapely-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ddc759f72b5b2b0f54a7e7cde44acef680a55019eb52ac63a7af2cf17cb9cd2", size = 4178586, upload-time = "2025-09-24T13:50:25.443Z" }, + { url = "https://files.pythonhosted.org/packages/16/f6/9840f6963ed4decf76b08fd6d7fed14f8779fb7a62cb45c5617fa8ac6eab/shapely-2.1.2-cp311-cp311-win32.whl", hash = "sha256:2fa78b49485391224755a856ed3b3bd91c8455f6121fee0db0e71cefb07d0ef6", size = 1543961, upload-time = "2025-09-24T13:50:26.968Z" }, + { url = "https://files.pythonhosted.org/packages/38/1e/3f8ea46353c2a33c1669eb7327f9665103aa3a8dfe7f2e4ef714c210b2c2/shapely-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:c64d5c97b2f47e3cd9b712eaced3b061f2b71234b3fc263e0fcf7d889c6559dc", size = 1722856, upload-time = "2025-09-24T13:50:28.497Z" }, +] + +[[package]] +name = "simplebloom" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5f/85/45f0e8448f37baa7e13949d1d93cf15264737498f4f953464494bf96f8c9/simplebloom-1.1.0.tar.gz", hash = "sha256:2e553d2cea8557c067156de7b8b28af738f36488eef01291559ccd7fa77c7b72", size = 74021, upload-time = "2025-10-10T17:53:46.531Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/79/33f98bcf49c930476ce982cc16424a9c63f801d54145ad1e26d5df64b7da/simplebloom-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5368923334c1852db3ff280880fe0ddb31585e0ac7cda0736d24fdd0cbdf4243", size = 28575, upload-time = "2025-10-10T17:52:57.518Z" }, + { url = "https://files.pythonhosted.org/packages/da/b3/83857a8617546d9ffd598eb3a0d42f16db67d5add5b68b988a0bf86a8858/simplebloom-1.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b29e0372612c2faf0f971db56f17d1dd06aca383a8649c5c9aced9caed51a8e", size = 29679, upload-time = "2025-10-10T17:52:58.847Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/07/d218212d28d16e9ad0e444cfc19053040e9291c41b85d846be2ee720c6a5/simplebloom-1.1.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:81a40982f2863618ee1ec93c39af03c9853f1102c5285ad9689414d9c821999b", size = 28608, upload-time = "2025-10-10T17:53:00.349Z" }, + { url = "https://files.pythonhosted.org/packages/d1/1a/e95815814ca9b2d3db710510423d5caa99df05864ca5cf37ca00f9b080b0/simplebloom-1.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:211efd0210bea019174b078fc6304b52ca675d8e76842199c1339d84ee4571b5", size = 28656, upload-time = "2025-10-10T17:53:01.428Z" }, + { url = "https://files.pythonhosted.org/packages/ce/d2/2f2e5789eec283dee652707768923d7fcc0673301a5cea60238f722ebffd/simplebloom-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6174b2e9096d6fd0ee9ed9eb0c4c7568034ee397a0a691629f9050c766e094ed", size = 28339, upload-time = "2025-10-10T17:53:02.325Z" }, + { url = "https://files.pythonhosted.org/packages/02/aa/0c6719d7d49c31c21181461928a401e37c6c1825fa1e77d6da8054bad8b8/simplebloom-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e16f8ae71bf4c453c59faad9580043d1ce90c15e0c0bd21b80b8de855f961a9b", size = 28780, upload-time = "2025-10-10T17:53:03.23Z" }, + { url = "https://files.pythonhosted.org/packages/ee/fd/ca9450ec8623e9568413605ae1f2a4f580821d9a1e9c60dbae060666bd65/simplebloom-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:1438207604a4ec0a03a811630997205a9aa38b5971a390714353a89bbec653e4", size = 27199, upload-time = "2025-10-10T17:53:04.149Z" }, + { url = "https://files.pythonhosted.org/packages/c0/9c/4418182183aa2091f48e3deb2352bc6dae1dbc771ceb83478d28b8073a58/simplebloom-1.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:434821861b1c00f50cdd77f87b5e98348b23c6625f877b268af03dbf822a5217", size = 24751, upload-time = "2025-10-10T17:53:05.055Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/cb/b54368b35260e20bf6ccbdfd47123ae4984ee2eff4d3ede8fc13f6631dc0/simplebloom-1.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bb066a469328f13838d24f1712cc7aec25c9b257ab9b4ccefb91b4528b88a321", size = 23460, upload-time = "2025-10-10T17:53:41.801Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a1/11785a806750a41048af35f7bc46d3bc89f20b55fd731de1a71986debad3/simplebloom-1.1.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:be8ac8feb9790a53280ac1bb4995d940ff6b9fca5e75bd131f760bdaa01e609f", size = 24222, upload-time = "2025-10-10T17:53:42.75Z" }, + { url = "https://files.pythonhosted.org/packages/3f/80/10bc657c173e9fe3a17a611418ac89015b4dae28d642125d1dd2d6deee82/simplebloom-1.1.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e9efd9fda69f365c2546ba5fc24e27e46201675a8a0d398f0455fbd85bf7ca44", size = 25327, upload-time = "2025-10-10T17:53:43.678Z" }, + { url = "https://files.pythonhosted.org/packages/88/d4/35d695e7dab216a14f4a064022e747b42a65c1a359c110bf359c96b1d101/simplebloom-1.1.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:86310d530533f2a6ac987d1753dab226d81d20049da30e3038f22bef4ac295b5", size = 25283, upload-time = "2025-10-10T17:53:44.643Z" }, + { url = "https://files.pythonhosted.org/packages/b0/99/15b32a9aa3f3682ac1ed08bcb9214ddd5b45d7488c6442ad81ae576c9a62/simplebloom-1.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:15a81a6337680377f9037eebba2368f9174fc0d365f4b0c88717cd4dfb4f1233", size = 24255, upload-time = "2025-10-10T17:53:45.556Z" }, +] + +[[package]] +name = "simplejpeg" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/64/da60f0ba80570f9a36c9b6e055f4364bda2c547715296d5773d2ea6d5a60/simplejpeg-1.9.0.tar.gz", hash = 
"sha256:5ac7d9489eeb812c2e7ea5c283994a29d9fefdfe5ed7b86c09d485e0dd366689", size = 3965764, upload-time = "2025-10-10T10:58:08.197Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/1c/787e062aa3ad48b93cbf516f7aff9ade275f2e3cd901e4eb81744959e5bb/simplejpeg-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:60191ea898d58aaef489a8f94bf34a7472a3ae5a40f16a364f154151f751d08b", size = 425492, upload-time = "2025-10-10T10:57:29.067Z" }, + { url = "https://files.pythonhosted.org/packages/17/5f/00178980659301d4257499143243fa7b7fa0ad348762072f40b08a0459bc/simplejpeg-1.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6cbc0eba5159c9c4b6d2930f429856b4f5b7b792fb48a4c93141e56878c9b71e", size = 401393, upload-time = "2025-10-10T10:57:30.321Z" }, + { url = "https://files.pythonhosted.org/packages/4d/42/941441677d990e43a53d96c667bf32a3e930855e4807a12e69dedf69c24a/simplejpeg-1.9.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:216ff066e9a05743470ade59ee6014c1a40655bf38a0fc40bae8c78511749a90", size = 448250, upload-time = "2025-10-10T10:57:31.602Z" }, + { url = "https://files.pythonhosted.org/packages/8e/2f/34c30d9dc903119931f03a1e81112c8f3cd829e833972f6446c0e49ff53f/simplejpeg-1.9.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9cd72c67f1c8fc67f1db432fdae7b03272ca56b72cbb43883c082b63358851c4", size = 405949, upload-time = "2025-10-10T10:57:32.837Z" }, + { url = "https://files.pythonhosted.org/packages/3a/6a/9952d5c3464f82cf974432ce52a4106ff7b26742eab6e2caa737c28df0ca/simplejpeg-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:8f242aa7401b12edfe3b5c76ee4391a30bfba8e0cb93bc5ddb6ff0c2d2bef33c", size = 292682, upload-time = "2025-10-10T10:57:34.181Z" }, + { url = "https://files.pythonhosted.org/packages/61/94/aed8b242461a3a603331d3c8eb59e4d56de4532b345d68764ad0896cf750/simplejpeg-1.9.0-cp311-cp311-win_arm64.whl", hash = 
"sha256:0e28186618efc16b02526ad68ecd53ef84babb3c88a7313624ed665dfe4649ac", size = 253544, upload-time = "2025-10-10T10:57:35.412Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "smmap" +version = "5.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/f4/4a80cd6ef364b2e8b65b15816a843c0980f7a5a2b4dc701fc574952aa19f/soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a", size = 103418, upload-time = "2025-04-20T18:50:08.518Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload-time = "2025-04-20T18:50:07.196Z" }, +] + +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + +[[package]] +name = "starlette" +version = "0.50.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/b8/73a0e6a6e079a9d9cfa64113d771e421640b6f679a52eeb9b32f72d871a1/starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca", size = 2646985, upload-time = "2025-11-01T15:25:27.516Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" }, +] + +[[package]] +name = "storage3" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecation" }, + { name = "httpx", extra = ["http2"] }, + { name = "pydantic" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/40/12/1f7723cd3538181bd37d626d8d7fd1c77e66be38bb0735a332604f48efcf/storage3-2.25.1.tar.gz", hash = "sha256:eb445dcaa3a6ead1c0b27d7d06bf9074592a1fdc07e57c648a69a9bf5057d7a0", size = 18546, upload-time = "2025-12-10T21:48:31.379Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/f8/a4/b20e10088e093ac499ea96a731781ee588aa22202c245cbc6a12a7a3cdfc/storage3-2.25.1-py3-none-any.whl", hash = "sha256:85e2439a5a092965b991ee018a510c3c1a3404b1e029813eca241f5a6bdd6296", size = 26756, upload-time = "2025-12-10T21:48:30.47Z" }, +] + +[[package]] +name = "strenum" +version = "0.4.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ad/430fb60d90e1d112a62ff57bdd1f286ec73a2a0331272febfddd21f330e1/StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff", size = 23384, upload-time = "2023-06-29T22:02:58.399Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/69/297302c5f5f59c862faa31e6cb9a4cd74721cd1e052b38e464c5b402df8b/StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659", size = 8851, upload-time = "2023-06-29T22:02:56.947Z" }, +] + +[[package]] +name = "supabase" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "postgrest" }, + { name = "realtime" }, + { name = "storage3" }, + { name = "supabase-auth" }, + { name = "supabase-functions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/65/ec2bdfd8c593d98a76b3c5e480a00455014a3c65795bb3b04f1cf71d1a8d/supabase-2.25.1.tar.gz", hash = "sha256:dd6663b6e63c93b12df999da6746127f948581302e86578454812d57328aea92", size = 9567, upload-time = "2025-12-10T21:48:32.891Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/5e/919b96e5464a6283cec6ebf250adcf82d5b514c085476278a91083a331d3/supabase-2.25.1-py3-none-any.whl", hash = "sha256:ddb209761ac741b6a474b2e125c77875490dfeeac29ca4fa6730df396f06eac0", size = 16442, upload-time = "2025-12-10T21:48:31.962Z" }, +] + +[[package]] +name = "supabase-auth" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { 
name = "httpx", extra = ["http2"] }, + { name = "pydantic" }, + { name = "pyjwt", extra = ["crypto"] }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/56/9ae28ab511ad200662a3ef3ad3277c296c0c7bb78edafb4f258c6af372ea/supabase_auth-2.25.1.tar.gz", hash = "sha256:978168ba28cba87f2c56b80ce596bcebabd51fe51816fc0007e9bedae22cc0ee", size = 38796, upload-time = "2025-12-10T21:48:35.958Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/3d/68c0a3616db75ce3f9189c64bc9f96ec712cb6a64938b66b9b0d0b506fe1/supabase_auth-2.25.1-py3-none-any.whl", hash = "sha256:cf18c9b0a92c986e53d4e3db2911d86b2688d1fc63f51f933d8315147d4d7118", size = 48019, upload-time = "2025-12-10T21:48:35.007Z" }, +] + +[[package]] +name = "supabase-functions" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx", extra = ["http2"] }, + { name = "strenum" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/4a/32a5010858f4f94ec6b31c2d065ea75f90a29595ddedb3a9b28c44ce533e/supabase_functions-2.25.1.tar.gz", hash = "sha256:6c8c47e29cafede051550a607fac750db4335382fd916d06239fa16be6afadbe", size = 4523, upload-time = "2025-12-10T21:48:37.752Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/1f/7ff96448db26ebc43e9d9e4d3b3e49e23c6b4a7dda908413a93ef20a8370/supabase_functions-2.25.1-py3-none-any.whl", hash = "sha256:8ba549a2e3d12a95f46438ad8474e15394dcc7abd05fc5b73b134eda712d096d", size = 8473, upload-time = "2025-12-10T21:48:36.675Z" }, +] + +[[package]] +name = "sympy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040, upload-time = "2024-07-19T09:26:51.238Z" } +wheels = [ + 
{ url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "tensorboard" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "absl-py" }, + { name = "grpcio" }, + { name = "markdown" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "protobuf" }, + { name = "setuptools" }, + { name = "tensorboard-data-server" }, + { name = "werkzeug" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/d9/a5db55f88f258ac669a92858b70a714bbbd5acd993820b41ec4a96a4d77f/tensorboard-2.20.0-py3-none-any.whl", hash = "sha256:9dc9f978cb84c0723acf9a345d96c184f0293d18f166bb8d59ee098e6cfaaba6", size = 5525680, upload-time = "2025-07-17T19:20:49.638Z" }, +] + +[[package]] +name = "tensorboard-data-server" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/13/e503968fefabd4c6b2650af21e110aa8466fe21432cd7c43a84577a89438/tensorboard_data_server-0.7.2-py3-none-any.whl", hash = 
"sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb", size = 2356, upload-time = "2023-10-23T21:23:32.16Z" }, + { url = "https://files.pythonhosted.org/packages/b7/85/dabeaf902892922777492e1d253bb7e1264cadce3cea932f7ff599e53fea/tensorboard_data_server-0.7.2-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60", size = 4823598, upload-time = "2023-10-23T21:23:33.714Z" }, + { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" }, +] + +[[package]] +name = "tensorboardx" +version = "2.6.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "packaging" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2b/c5/d4cc6e293fb837aaf9f76dd7745476aeba8ef7ef5146c3b3f9ee375fe7a5/tensorboardx-2.6.4.tar.gz", hash = "sha256:b163ccb7798b31100b9f5fa4d6bc22dad362d7065c2f24b51e50731adde86828", size = 4769801, upload-time = "2025-06-10T22:37:07.419Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/1d/b5d63f1a6b824282b57f7b581810d20b7a28ca951f2d5b59f1eb0782c12b/tensorboardx-2.6.4-py3-none-any.whl", hash = "sha256:5970cf3a1f0a6a6e8b180ccf46f3fe832b8a25a70b86e5a237048a7c0beb18e2", size = 87201, upload-time = "2025-06-10T22:37:05.44Z" }, +] + +[[package]] +name = "termcolor" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/87/56/ab275c2b56a5e2342568838f0d5e3e66a32354adcc159b495e374cda43f5/termcolor-3.2.0.tar.gz", hash = "sha256:610e6456feec42c4bcd28934a8c87a06c3fa28b01561d46aa09a9881b8622c58", size = 14423, upload-time = "2025-10-25T19:11:42.586Z" } +wheels = [ + 
{ url = "https://files.pythonhosted.org/packages/f9/d5/141f53d7c1eb2a80e6d3e9a390228c3222c27705cbe7f048d3623053f3ca/termcolor-3.2.0-py3-none-any.whl", hash = "sha256:a10343879eba4da819353c55cb8049b0933890c2ebf9ad5d3ecd2bb32ea96ea6", size = 7698, upload-time = "2025-10-25T19:11:41.536Z" }, +] + +[[package]] +name = "terminaltables" +version = "3.1.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/fc/0b73d782f5ab7feba8d007573a3773c58255f223c5940a7b7085f02153c3/terminaltables-3.1.10.tar.gz", hash = "sha256:ba6eca5cb5ba02bba4c9f4f985af80c54ec3dccf94cfcd190154386255e47543", size = 12264, upload-time = "2021-12-07T19:03:35.758Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/fb/ea621e0a19733e01fe4005d46087d383693c0f4a8f824b47d8d4122c87e0/terminaltables-3.1.10-py2.py3-none-any.whl", hash = "sha256:e4fdc4179c9e4aab5f674d80f09d76fa436b96fdc698a8505e0a36bf0804a874", size = 15155, upload-time = "2021-12-07T19:03:34.013Z" }, +] + +[[package]] +name = "textdistance" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/4c/96d7ff24f1bee11ade34b1daea9f70fc4c115781bbf380089470c053ef4d/textdistance-4.2.0.tar.gz", hash = "sha256:6d2a398815aeed453cfb38a3b62da74e33fa6a5f4e42845fd1d2c9611836befd", size = 34519, upload-time = "2020-04-13T09:59:24.571Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/71/87133323736b9b0180f600d477507318dae0abde613a54df33bfd0248614/textdistance-4.2.0-py3-none-any.whl", hash = "sha256:61ddcdd9a78da99eff11dc1219d444f72915212cf36947de3266a356f5e934f7", size = 29118, upload-time = "2020-04-13T09:59:27.03Z" }, +] + +[package.optional-dependencies] +levenshtein = [ + { name = "abydos" }, + { name = "jellyfish" }, + { name = "numpy" }, + { name = "python-levenshtein" }, + { name = "pyxdameraulevenshtein" }, +] + +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "timm" +version = "1.0.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, + { name = "torchvision", version = "0.16.0", source = { registry = "https://download.pytorch.org/whl/cu121" }, marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.16.0+cu121", source = { registry = "https://download.pytorch.org/whl/cu121" }, marker = "platform_machine != 'aarch64' or platform_python_implementation != 'CPython' or sys_platform != 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/63/ab9bc9441f173fab436d15070dbc90341ff1e439f3b76c6871bc37176580/timm-1.0.21.tar.gz", hash = "sha256:aa372fe43a85ed6ea0dd14945dac724c842e6e373779e2a2afd67d7dc1b82c4c", size = 2382582, upload-time = "2025-10-24T22:37:57.756Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/8c/a668e732032f6de4ecc6b33f7ed27eab1c238dce35f6fe39986ad61aed9e/timm-1.0.21-py3-none-any.whl", hash = "sha256:e7428083af9f68af5ef1d50724946d9b6a2ccba8688c3e5fc9370f59f76e50cf", size = 2529988, upload-time = "2025-10-24T22:37:55.539Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.21.4" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/2f/402986d0823f8d7ca139d969af2917fefaa9b947d1fb32f6168c509f2492/tokenizers-0.21.4.tar.gz", hash = "sha256:fa23f85fbc9a02ec5c6978da172cdcbac23498c3ca9f3645c5c68740ac007880", size = 351253, upload-time = "2025-07-28T15:48:54.325Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/c6/fdb6f72bf6454f52eb4a2510be7fb0f614e541a2554d6210e370d85efff4/tokenizers-0.21.4-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:2ccc10a7c3bcefe0f242867dc914fc1226ee44321eb618cfe3019b5df3400133", size = 2863987, upload-time = "2025-07-28T15:48:44.877Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a6/28975479e35ddc751dc1ddc97b9b69bf7fcf074db31548aab37f8116674c/tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:5e2f601a8e0cd5be5cc7506b20a79112370b9b3e9cb5f13f68ab11acd6ca7d60", size = 2732457, upload-time = "2025-07-28T15:48:43.265Z" }, + { url = "https://files.pythonhosted.org/packages/aa/8f/24f39d7b5c726b7b0be95dca04f344df278a3fe3a4deb15a975d194cbb32/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b376f5a1aee67b4d29032ee85511bbd1b99007ec735f7f35c8a2eb104eade5", size = 3012624, upload-time = "2025-07-28T13:22:43.895Z" }, + { url = "https://files.pythonhosted.org/packages/58/47/26358925717687a58cb74d7a508de96649544fad5778f0cd9827398dc499/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2107ad649e2cda4488d41dfd031469e9da3fcbfd6183e74e4958fa729ffbf9c6", size = 2939681, upload-time = "2025-07-28T13:22:47.499Z" }, + { url = "https://files.pythonhosted.org/packages/99/6f/cc300fea5db2ab5ddc2c8aea5757a27b89c84469899710c3aeddc1d39801/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c73012da95afafdf235ba80047699df4384fdc481527448a078ffd00e45a7d9", size = 3247445, upload-time 
= "2025-07-28T15:48:39.711Z" }, + { url = "https://files.pythonhosted.org/packages/be/bf/98cb4b9c3c4afd8be89cfa6423704337dc20b73eb4180397a6e0d456c334/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f23186c40395fc390d27f519679a58023f368a0aad234af145e0f39ad1212732", size = 3428014, upload-time = "2025-07-28T13:22:49.569Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/96c1cc780e6ca7f01a57c13235dd05b7bc1c0f3588512ebe9d1331b5f5ae/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc88bb34e23a54cc42713d6d98af5f1bf79c07653d24fe984d2d695ba2c922a2", size = 3193197, upload-time = "2025-07-28T13:22:51.471Z" }, + { url = "https://files.pythonhosted.org/packages/f2/90/273b6c7ec78af547694eddeea9e05de771278bd20476525ab930cecaf7d8/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51b7eabb104f46c1c50b486520555715457ae833d5aee9ff6ae853d1130506ff", size = 3115426, upload-time = "2025-07-28T15:48:41.439Z" }, + { url = "https://files.pythonhosted.org/packages/91/43/c640d5a07e95f1cf9d2c92501f20a25f179ac53a4f71e1489a3dcfcc67ee/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:714b05b2e1af1288bd1bc56ce496c4cebb64a20d158ee802887757791191e6e2", size = 9089127, upload-time = "2025-07-28T15:48:46.472Z" }, + { url = "https://files.pythonhosted.org/packages/44/a1/dd23edd6271d4dca788e5200a807b49ec3e6987815cd9d0a07ad9c96c7c2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:1340ff877ceedfa937544b7d79f5b7becf33a4cfb58f89b3b49927004ef66f78", size = 9055243, upload-time = "2025-07-28T15:48:48.539Z" }, + { url = "https://files.pythonhosted.org/packages/21/2b/b410d6e9021c4b7ddb57248304dc817c4d4970b73b6ee343674914701197/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3c1f4317576e465ac9ef0d165b247825a2a4078bcd01cba6b54b867bdf9fdd8b", size = 9298237, upload-time = "2025-07-28T15:48:50.443Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/0a/42348c995c67e2e6e5c89ffb9cfd68507cbaeb84ff39c49ee6e0a6dd0fd2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:c212aa4e45ec0bb5274b16b6f31dd3f1c41944025c2358faaa5782c754e84c24", size = 9461980, upload-time = "2025-07-28T15:48:52.325Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d3/dacccd834404cd71b5c334882f3ba40331ad2120e69ded32cf5fda9a7436/tokenizers-0.21.4-cp39-abi3-win32.whl", hash = "sha256:6c42a930bc5f4c47f4ea775c91de47d27910881902b0f20e4990ebe045a415d0", size = 2329871, upload-time = "2025-07-28T15:48:56.841Z" }, + { url = "https://files.pythonhosted.org/packages/41/f2/fd673d979185f5dcbac4be7d09461cbb99751554ffb6718d0013af8604cb/tokenizers-0.21.4-cp39-abi3-win_amd64.whl", hash = "sha256:475d807a5c3eb72c59ad9b5fcdb254f6e17f53dfcbb9903233b0dfa9c943b597", size = 2507568, upload-time = "2025-07-28T15:48:55.456Z" }, +] + +[[package]] +name = "torch" +version = "2.1.0+cu121" +source = { registry = "https://download.pytorch.org/whl/cu121" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "sympy" }, + { name = "triton" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu121/torch-2.1.0%2Bcu121-cp311-cp311-linux_x86_64.whl", hash = "sha256:aa984599c2c4ffbc57c48d0d965cbe832e610c967e8179d4ac0a582c733fe112" }, + { url = "https://download.pytorch.org/whl/cu121/torch-2.1.0%2Bcu121-cp311-cp311-win_amd64.whl", hash = "sha256:3b7c6dd1ab12a9c70b29bf1ea34fcf2c519233c58c619c1a553d328955c8a602" }, +] + +[[package]] +name = "torchinfo" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/53/d9/2b811d1c0812e9ef23e6cf2dbe022becbe6c5ab065e33fd80ee05c0cd996/torchinfo-1.8.0.tar.gz", hash = "sha256:72e94b0e9a3e64dc583a8e5b7940b8938a1ac0f033f795457f27e6f4e7afa2e9", size = 25880, upload-time = 
"2023-05-14T19:23:26.377Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/25/973bd6128381951b23cdcd8a9870c6dcfc5606cb864df8eabd82e529f9c1/torchinfo-1.8.0-py3-none-any.whl", hash = "sha256:2e911c2918603f945c26ff21a3a838d12709223dc4ccf243407bce8b6e897b46", size = 23377, upload-time = "2023-05-14T19:23:24.141Z" }, +] + +[[package]] +name = "torchvision" +version = "0.16.0" +source = { registry = "https://download.pytorch.org/whl/cu121" } +resolution-markers = [ + "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'", +] +dependencies = [ + { name = "numpy", marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, + { name = "requests", marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/torchvision-0.16.0-cp311-cp311-linux_aarch64.whl", hash = "sha256:9ed5f21e5a56e466667c6f9f6f93dba2a75e29921108bd70043eaf8e9ba0a7cc" }, +] + +[[package]] +name = "torchvision" +version = "0.16.0+cu121" +source = { registry = "https://download.pytorch.org/whl/cu121" } +resolution-markers = [ + "platform_machine == 'aarch64' and platform_python_implementation != 'CPython' and sys_platform == 'linux'", + "platform_machine != 'aarch64' and sys_platform == 'linux'", + "sys_platform == 'darwin'", + "sys_platform != 'darwin' and sys_platform != 'linux'", +] +dependencies = [ + { name = "numpy", marker = "platform_machine != 'aarch64' or platform_python_implementation != 'CPython' or sys_platform != 'linux'" }, + { name = "pillow", marker = "platform_machine 
!= 'aarch64' or platform_python_implementation != 'CPython' or sys_platform != 'linux'" }, + { name = "requests", marker = "platform_machine != 'aarch64' or platform_python_implementation != 'CPython' or sys_platform != 'linux'" }, + { name = "torch", marker = "platform_machine != 'aarch64' or platform_python_implementation != 'CPython' or sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu121/torchvision-0.16.0%2Bcu121-cp311-cp311-linux_x86_64.whl", hash = "sha256:7a325270c7806571ceddbd27c8ece5c163cceb476f09dcca7eb5157073216b22" }, + { url = "https://download.pytorch.org/whl/cu121/torchvision-0.16.0%2Bcu121-cp311-cp311-win_amd64.whl", hash = "sha256:09dea0b374be56df4ae148e83221f172a8a6c999475e9483037ab6efa3cd6b80" }, +] + +[[package]] +name = "tornado" +version = "6.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/ce/1eb500eae19f4648281bb2186927bb062d2438c2e5093d1360391afd2f90/tornado-6.5.2.tar.gz", hash = "sha256:ab53c8f9a0fa351e2c0741284e06c7a45da86afb544133201c5cc8578eb076a0", size = 510821, upload-time = "2025-08-08T18:27:00.78Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/48/6a7529df2c9cc12efd2e8f5dd219516184d703b34c06786809670df5b3bd/tornado-6.5.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:2436822940d37cde62771cff8774f4f00b3c8024fe482e16ca8387b8a2724db6", size = 442563, upload-time = "2025-08-08T18:26:42.945Z" }, + { url = "https://files.pythonhosted.org/packages/f2/b5/9b575a0ed3e50b00c40b08cbce82eb618229091d09f6d14bce80fc01cb0b/tornado-6.5.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:583a52c7aa94ee046854ba81d9ebb6c81ec0fd30386d96f7640c96dad45a03ef", size = 440729, upload-time = "2025-08-08T18:26:44.473Z" }, + { url = "https://files.pythonhosted.org/packages/1b/4e/619174f52b120efcf23633c817fd3fed867c30bff785e2cd5a53a70e483c/tornado-6.5.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b0fe179f28d597deab2842b86ed4060deec7388f1fd9c1b4a41adf8af058907e", size = 444295, upload-time = "2025-08-08T18:26:46.021Z" }, + { url = "https://files.pythonhosted.org/packages/95/fa/87b41709552bbd393c85dd18e4e3499dcd8983f66e7972926db8d96aa065/tornado-6.5.2-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b186e85d1e3536d69583d2298423744740986018e393d0321df7340e71898882", size = 443644, upload-time = "2025-08-08T18:26:47.625Z" }, + { url = "https://files.pythonhosted.org/packages/f9/41/fb15f06e33d7430ca89420283a8762a4e6b8025b800ea51796ab5e6d9559/tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e792706668c87709709c18b353da1f7662317b563ff69f00bab83595940c7108", size = 443878, upload-time = "2025-08-08T18:26:50.599Z" }, + { url = "https://files.pythonhosted.org/packages/11/92/fe6d57da897776ad2e01e279170ea8ae726755b045fe5ac73b75357a5a3f/tornado-6.5.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:06ceb1300fd70cb20e43b1ad8aaee0266e69e7ced38fa910ad2e03285009ce7c", size = 444549, upload-time = "2025-08-08T18:26:51.864Z" }, + { url = "https://files.pythonhosted.org/packages/9b/02/c8f4f6c9204526daf3d760f4aa555a7a33ad0e60843eac025ccfd6ff4a93/tornado-6.5.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:74db443e0f5251be86cbf37929f84d8c20c27a355dd452a5cfa2aada0d001ec4", size = 443973, upload-time = "2025-08-08T18:26:53.625Z" }, + { url = "https://files.pythonhosted.org/packages/ae/2d/f5f5707b655ce2317190183868cd0f6822a1121b4baeae509ceb9590d0bd/tornado-6.5.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b5e735ab2889d7ed33b32a459cac490eda71a1ba6857b0118de476ab6c366c04", size = 443954, upload-time = "2025-08-08T18:26:55.072Z" }, + { url = "https://files.pythonhosted.org/packages/e8/59/593bd0f40f7355806bf6573b47b8c22f8e1374c9b6fd03114bd6b7a3dcfd/tornado-6.5.2-cp39-abi3-win32.whl", hash = 
"sha256:c6f29e94d9b37a95013bb669616352ddb82e3bfe8326fccee50583caebc8a5f0", size = 445023, upload-time = "2025-08-08T18:26:56.677Z" }, + { url = "https://files.pythonhosted.org/packages/c7/2a/f609b420c2f564a748a2d80ebfb2ee02a73ca80223af712fca591386cafb/tornado-6.5.2-cp39-abi3-win_amd64.whl", hash = "sha256:e56a5af51cc30dd2cae649429af65ca2f6571da29504a07995175df14c18f35f", size = 445427, upload-time = "2025-08-08T18:26:57.91Z" }, + { url = "https://files.pythonhosted.org/packages/5e/4f/e1f65e8f8c76d73658b33d33b81eed4322fb5085350e4328d5c956f0c8f9/tornado-6.5.2-cp39-abi3-win_arm64.whl", hash = "sha256:d6c33dc3672e3a1f3618eb63b7ef4683a7688e7b9e6e8f0d9aa5726360a004af", size = 444456, upload-time = "2025-08-08T18:26:59.207Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, +] + +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + +[[package]] +name = "transformers" +version = "4.49.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/50/46573150944f46df8ec968eda854023165a84470b42f69f67c7d475dabc5/transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e", size = 8610952, upload-time = "2025-02-17T15:19:03.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/37/1f29af63e9c30156a3ed6ebc2754077016577c094f31de7b2631e5d379eb/transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03", size = 9970275, upload-time = "2025-02-17T15:18:58.814Z" }, +] + +[[package]] +name = "trio" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "cffi", marker = "implementation_name != 'pypy' and os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "idna" }, + { name = "outcome" }, + { name = "sniffio" }, + { name = "sortedcontainers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/8f/c6e36dd11201e2a565977d8b13f0b027ba4593c1a80bed5185489178e257/trio-0.31.0.tar.gz", hash = "sha256:f71d551ccaa79d0cb73017a33ef3264fde8335728eb4c6391451fe5d253a9d5b", size = 605825, upload-time = "2025-09-09T15:17:15.242Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/31/5b/94237a3485620dbff9741df02ff6d8acaa5fdec67d81ab3f62e4d8511bf7/trio-0.31.0-py3-none-any.whl", hash = "sha256:b5d14cd6293d79298b49c3485ffd9c07e3ce03a6da8c7dfbe0cb3dd7dc9a4774", size = 512679, upload-time = "2025-09-09T15:17:13.821Z" }, +] + +[[package]] +name = "trio-websocket" +version = "0.12.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "outcome" }, + { name = "trio" }, + { name = "wsproto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/3c/8b4358e81f2f2cfe71b66a267f023a91db20a817b9425dd964873796980a/trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae", size = 33549, upload-time = "2025-02-25T05:16:58.947Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/19/eb640a397bba49ba49ef9dbe2e7e5c04202ba045b6ce2ec36e9cadc51e04/trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6", size = 21221, upload-time = "2025-02-25T05:16:57.545Z" }, +] + +[[package]] +name = "triton" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/c1/54fffb2eb13d293d9a429fead3646752ea190de0229bcf3d591ba2481263/triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:919b06453f0033ea52c13eaf7833de0e57db3178d23d4e04f9fc71c4f2c32bf8", size = 89234153, upload-time = "2023-09-01T07:26:20.161Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.14.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/5a/da40306b885cc8c09109dc2e1abd358d5684b1425678151cdaed4731c822/typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36", size = 107673, upload-time = 
"2025-07-04T13:28:34.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, +] + +[[package]] +name = "umap-learn" +version = "0.5.9.post2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numba" }, + { name = "numpy" 
}, + { name = "pynndescent" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5f/ee/6bc65bd375c812026a7af63fe9d09d409382120aff25f2152f1ba12af5ec/umap_learn-0.5.9.post2.tar.gz", hash = "sha256:bdf60462d779bd074ce177a0714ced17e6d161285590fa487f3f9548dd3c31c9", size = 95441, upload-time = "2025-07-03T00:18:02.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/b1/c24deeda9baf1fd491aaad941ed89e0fed6c583a117fd7b79e0a33a1e6c0/umap_learn-0.5.9.post2-py3-none-any.whl", hash = "sha256:fbe51166561e0e7fab00ef3d516ac2621243b8d15cf4bef9f656d701736b16a0", size = 90146, upload-time = "2025-07-03T00:18:01.042Z" }, +] + +[[package]] +name = "uritemplate" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/60/f174043244c5306c9988380d2cb10009f91563fc4b31293d27e17201af56/uritemplate-4.2.0.tar.gz", hash = "sha256:480c2ed180878955863323eea31b0ede668795de182617fef9c6ca09e6ec9d0e", size = 33267, upload-time = "2025-06-02T15:12:06.318Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/99/3ae339466c9183ea5b8ae87b34c0b897eda475d2aec2307cae60e5cd4f29/uritemplate-4.2.0-py3-none-any.whl", hash = "sha256:962201ba1c4edcab02e60f9a0d3821e82dfc5d2d6662a21abd533879bdb8a686", size = 11488, upload-time = "2025-06-02T15:12:03.405Z" }, +] + +[[package]] +name = "urllib3" +version = "2.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = 
"sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, +] + +[package.optional-dependencies] +socks = [ + { name = "pysocks" }, +] + +[[package]] +name = "uvicorn" +version = "0.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/d1/8f3c683c9561a4e6689dd3b1d345c815f10f86acd044ee1fb9a4dcd0b8c5/uvicorn-0.40.0.tar.gz", hash = "sha256:839676675e87e73694518b5574fd0f24c9d97b46bea16df7b8c05ea1a51071ea", size = 81761, upload-time = "2025-12-21T14:16:22.45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, +] + +[package.optional-dependencies] +standard = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "httptools" }, + { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" }, + { name = "watchfiles" }, + { name = "websockets" }, +] + +[[package]] +name = "uvloop" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/06/f0/18d39dbd1971d6d62c4629cc7fa67f74821b0dc1f5a77af43719de7936a7/uvloop-0.22.1.tar.gz", hash = "sha256:6c84bae345b9147082b17371e3dd5d42775bddce91f885499017f4607fdaf39f", size = 2443250, upload-time = "2025-10-16T22:17:19.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/d5/69900f7883235562f1f50d8184bb7dd84a2fb61e9ec63f3782546fdbd057/uvloop-0.22.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:c60ebcd36f7b240b30788554b6f0782454826a0ed765d8430652621b5de674b9", size = 1352420, upload-time = "2025-10-16T22:16:21.187Z" }, + { url = "https://files.pythonhosted.org/packages/a8/73/c4e271b3bce59724e291465cc936c37758886a4868787da0278b3b56b905/uvloop-0.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b7f102bf3cb1995cfeaee9321105e8f5da76fdb104cdad8986f85461a1b7b77", size = 748677, upload-time = "2025-10-16T22:16:22.558Z" }, + { url = "https://files.pythonhosted.org/packages/86/94/9fb7fad2f824d25f8ecac0d70b94d0d48107ad5ece03769a9c543444f78a/uvloop-0.22.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53c85520781d84a4b8b230e24a5af5b0778efdb39142b424990ff1ef7c48ba21", size = 3753819, upload-time = "2025-10-16T22:16:23.903Z" }, + { url = "https://files.pythonhosted.org/packages/74/4f/256aca690709e9b008b7108bc85fba619a2bc37c6d80743d18abad16ee09/uvloop-0.22.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56a2d1fae65fd82197cb8c53c367310b3eabe1bbb9fb5a04d28e3e3520e4f702", size = 3804529, upload-time = "2025-10-16T22:16:25.246Z" }, + { url = "https://files.pythonhosted.org/packages/7f/74/03c05ae4737e871923d21a76fe28b6aad57f5c03b6e6bfcfa5ad616013e4/uvloop-0.22.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40631b049d5972c6755b06d0bfe8233b1bd9a8a6392d9d1c45c10b6f9e9b2733", size = 3621267, upload-time = "2025-10-16T22:16:26.819Z" }, + { url = "https://files.pythonhosted.org/packages/75/be/f8e590fe61d18b4a92070905497aec4c0e64ae1761498cad09023f3f4b3e/uvloop-0.22.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:535cc37b3a04f6cd2c1ef65fa1d370c9a35b6695df735fcff5427323f2cd5473", size = 3723105, upload-time = "2025-10-16T22:16:28.252Z" }, +] + +[[package]] +name = "wandb" +version = "0.21.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "gitpython" }, + { name = "packaging" }, + { name = 
"platformdirs" }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sentry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/69/217598886af89350e36bc05c092a67c9c469cff1fd6446edd4c879027e36/wandb-0.21.1.tar.gz", hash = "sha256:753bbdaa3a7703344056e019425b39c17a3d31d8ca0c4d13c4efc046935b08b9", size = 40131395, upload-time = "2025-08-07T18:52:48.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/d0/589f970741f3ead9ad28d4cbb668d1e6a39848df767f004ac9c7bed8f4b5/wandb-0.21.1-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:96f9eedeae428de0d88f9751fb81f1b730ae7902f35c2f5a7a904d7733f124f3", size = 21701698, upload-time = "2025-08-07T18:52:22.399Z" }, + { url = "https://files.pythonhosted.org/packages/41/6c/a6140a0f395a99902aafdfe63088b7aff509e4f14cd7dd084d47eab36f27/wandb-0.21.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:41a1ec1b98d9d7e1bcafc483bce82e184b6cbae7531328a0fe8dd0f56d96a92e", size = 21221046, upload-time = "2025-08-07T18:52:26.134Z" }, + { url = "https://files.pythonhosted.org/packages/e9/d8/dacbb30ed35141d48a387d84f2e792d4b61b5bcdbf5ffdbd3f0b57beb346/wandb-0.21.1-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:f74d4691c38318ed8611e00ca3246b4152a03ff390fdce41816bea5705452a73", size = 21885803, upload-time = "2025-08-07T18:52:28.489Z" }, + { url = "https://files.pythonhosted.org/packages/b0/48/3a7290a33b1f64e29ac8779dab4d4cdef31a9ed3c3d9ea656a4507d64332/wandb-0.21.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c8fbd60b9abf4b9bec201f311602f61394d41a3503c801750b03975a5e36d1b", size = 20825318, upload-time = "2025-08-07T18:52:31.282Z" }, + { url = "https://files.pythonhosted.org/packages/a9/54/c0a087114ff1bb6c32e64aaa58aea4342cebc0ad58b1378c0a5a831d2508/wandb-0.21.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5ded9313672630c0630f5b13c598ce9aa0e932e811ebc18823fcc4d73acfb6bb", size = 22362500, upload-time = "2025-08-07T18:52:33.889Z" }, + { url = "https://files.pythonhosted.org/packages/65/68/3aae277ea9fb5d91eec066cf256755bed3a740d92b539888a7ce36cf3f6c/wandb-0.21.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:44f3194d697b409f91708c50c5f9d56e282434a0d60ac380b64f0fb6991cd630", size = 20830372, upload-time = "2025-08-07T18:52:36.76Z" }, + { url = "https://files.pythonhosted.org/packages/d2/bb/58d206e79be1f279ef06cb934ae1e208bcacd2cd73b7a7652236575010d6/wandb-0.21.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e0b68bb6dbe94f1910c665c755f438292df40c272feb1a8b42208c1df52cce26", size = 22438521, upload-time = "2025-08-07T18:52:39.672Z" }, + { url = "https://files.pythonhosted.org/packages/e7/b8/dfe01f8e4c40d5dda820fd839c39431608a3453670f79404fa28915972d2/wandb-0.21.1-py3-none-win32.whl", hash = "sha256:98306c3fb369dfafb7194270b938b000ea2bb08dbddff10c19b5a805fd5cab80", size = 21569814, upload-time = "2025-08-07T18:52:42.58Z" }, + { url = "https://files.pythonhosted.org/packages/51/ba/81c77d5d831fcddb89661c85175fcbb91d2ffecf6b0591972829da3eb42f/wandb-0.21.1-py3-none-win_amd64.whl", hash = "sha256:8be92a7e92b5cb5ce00ec0961f9dbaad7757ffdbc5b5a8f2cc7188e23f653f0a", size = 21569817, upload-time = "2025-08-07T18:52:45.559Z" }, +] + +[[package]] +name = "watchfiles" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/c9/8869df9b2a2d6c59d79220a4db37679e74f807c559ffe5265e08b227a210/watchfiles-1.1.1.tar.gz", hash = "sha256:a173cb5c16c4f40ab19cecf48a534c409f7ea983ab8fed0741304a1c0a31b3f2", size = 94440, upload-time = "2025-10-14T15:06:21.08Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/f8/2c5f479fb531ce2f0564eda479faecf253d886b1ab3630a39b7bf7362d46/watchfiles-1.1.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:f57b396167a2565a4e8b5e56a5a1c537571733992b226f4f1197d79e94cf0ae5", size = 406529, upload-time = "2025-10-14T15:04:32.899Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cd/f515660b1f32f65df671ddf6f85bfaca621aee177712874dc30a97397977/watchfiles-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:421e29339983e1bebc281fab40d812742268ad057db4aee8c4d2bce0af43b741", size = 394384, upload-time = "2025-10-14T15:04:33.761Z" }, + { url = "https://files.pythonhosted.org/packages/7b/c3/28b7dc99733eab43fca2d10f55c86e03bd6ab11ca31b802abac26b23d161/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e43d39a741e972bab5d8100b5cdacf69db64e34eb19b6e9af162bccf63c5cc6", size = 448789, upload-time = "2025-10-14T15:04:34.679Z" }, + { url = "https://files.pythonhosted.org/packages/4a/24/33e71113b320030011c8e4316ccca04194bf0cbbaeee207f00cbc7d6b9f5/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f537afb3276d12814082a2e9b242bdcf416c2e8fd9f799a737990a1dbe906e5b", size = 460521, upload-time = "2025-10-14T15:04:35.963Z" }, + { url = "https://files.pythonhosted.org/packages/f4/c3/3c9a55f255aa57b91579ae9e98c88704955fa9dac3e5614fb378291155df/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2cd9e04277e756a2e2d2543d65d1e2166d6fd4c9b183f8808634fda23f17b14", size = 488722, upload-time = "2025-10-14T15:04:37.091Z" }, + { url = "https://files.pythonhosted.org/packages/49/36/506447b73eb46c120169dc1717fe2eff07c234bb3232a7200b5f5bd816e9/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3f58818dc0b07f7d9aa7fe9eb1037aecb9700e63e1f6acfed13e9fef648f5d", size = 596088, upload-time = "2025-10-14T15:04:38.39Z" }, + { url = "https://files.pythonhosted.org/packages/82/ab/5f39e752a9838ec4d52e9b87c1e80f1ee3ccdbe92e183c15b6577ab9de16/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:9bb9f66367023ae783551042d31b1d7fd422e8289eedd91f26754a66f44d5cff", size = 472923, upload-time = "2025-10-14T15:04:39.666Z" }, + { url = "https://files.pythonhosted.org/packages/af/b9/a419292f05e302dea372fa7e6fda5178a92998411f8581b9830d28fb9edb/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aebfd0861a83e6c3d1110b78ad54704486555246e542be3e2bb94195eabb2606", size = 456080, upload-time = "2025-10-14T15:04:40.643Z" }, + { url = "https://files.pythonhosted.org/packages/b0/c3/d5932fd62bde1a30c36e10c409dc5d54506726f08cb3e1d8d0ba5e2bc8db/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5fac835b4ab3c6487b5dbad78c4b3724e26bcc468e886f8ba8cc4306f68f6701", size = 629432, upload-time = "2025-10-14T15:04:41.789Z" }, + { url = "https://files.pythonhosted.org/packages/f7/77/16bddd9779fafb795f1a94319dc965209c5641db5bf1edbbccace6d1b3c0/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:399600947b170270e80134ac854e21b3ccdefa11a9529a3decc1327088180f10", size = 623046, upload-time = "2025-10-14T15:04:42.718Z" }, + { url = "https://files.pythonhosted.org/packages/46/ef/f2ecb9a0f342b4bfad13a2787155c6ee7ce792140eac63a34676a2feeef2/watchfiles-1.1.1-cp311-cp311-win32.whl", hash = "sha256:de6da501c883f58ad50db3a32ad397b09ad29865b5f26f64c24d3e3281685849", size = 271473, upload-time = "2025-10-14T15:04:43.624Z" }, + { url = "https://files.pythonhosted.org/packages/94/bc/f42d71125f19731ea435c3948cad148d31a64fccde3867e5ba4edee901f9/watchfiles-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:35c53bd62a0b885bf653ebf6b700d1bf05debb78ad9292cf2a942b23513dc4c4", size = 287598, upload-time = "2025-10-14T15:04:44.516Z" }, + { url = "https://files.pythonhosted.org/packages/57/c9/a30f897351f95bbbfb6abcadafbaca711ce1162f4db95fc908c98a9165f3/watchfiles-1.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:57ca5281a8b5e27593cb7d82c2ac927ad88a96ed406aa446f6344e4328208e9e", size = 277210, upload-time = 
"2025-10-14T15:04:45.883Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8e/e500f8b0b77be4ff753ac94dc06b33d8f0d839377fee1b78e8c8d8f031bf/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:db476ab59b6765134de1d4fe96a1a9c96ddf091683599be0f26147ea1b2e4b88", size = 408250, upload-time = "2025-10-14T15:06:10.264Z" }, + { url = "https://files.pythonhosted.org/packages/bd/95/615e72cd27b85b61eec764a5ca51bd94d40b5adea5ff47567d9ebc4d275a/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:89eef07eee5e9d1fda06e38822ad167a044153457e6fd997f8a858ab7564a336", size = 396117, upload-time = "2025-10-14T15:06:11.28Z" }, + { url = "https://files.pythonhosted.org/packages/c9/81/e7fe958ce8a7fb5c73cc9fb07f5aeaf755e6aa72498c57d760af760c91f8/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce19e06cbda693e9e7686358af9cd6f5d61312ab8b00488bc36f5aabbaf77e24", size = 450493, upload-time = "2025-10-14T15:06:12.321Z" }, + { url = "https://files.pythonhosted.org/packages/6e/d4/ed38dd3b1767193de971e694aa544356e63353c33a85d948166b5ff58b9e/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e6f39af2eab0118338902798b5aa6664f46ff66bc0280de76fca67a7f262a49", size = 457546, upload-time = "2025-10-14T15:06:13.372Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/30/6b0809f4510673dc723187aeaf24c7f5459922d01e2f794277a3dfb90345/wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605", size = 102293, upload-time = "2025-09-22T16:29:53.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = 
"sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" }, +] + +[[package]] +name = "websocket-client" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/41/aa4bf9664e4cda14c3b39865b12251e8e7d239f4cd0e3cc1b6c2ccde25c1/websocket_client-1.9.0.tar.gz", hash = "sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98", size = 70576, upload-time = "2025-10-07T21:16:36.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" }, +] + +[[package]] +name = "websockets" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/32/18fcd5919c293a398db67443acd33fde142f283853076049824fc58e6f75/websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431", size = 175423, upload-time = "2025-03-05T20:01:56.276Z" }, + { url = "https://files.pythonhosted.org/packages/76/70/ba1ad96b07869275ef42e2ce21f07a5b0148936688c2baf7e4a1f60d5058/websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57", size = 173082, upload-time = "2025-03-05T20:01:57.563Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/f2/10b55821dd40eb696ce4704a87d57774696f9451108cff0d2824c97e0f97/websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905", size = 173330, upload-time = "2025-03-05T20:01:59.063Z" }, + { url = "https://files.pythonhosted.org/packages/a5/90/1c37ae8b8a113d3daf1065222b6af61cc44102da95388ac0018fcb7d93d9/websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562", size = 182878, upload-time = "2025-03-05T20:02:00.305Z" }, + { url = "https://files.pythonhosted.org/packages/8e/8d/96e8e288b2a41dffafb78e8904ea7367ee4f891dafc2ab8d87e2124cb3d3/websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792", size = 181883, upload-time = "2025-03-05T20:02:03.148Z" }, + { url = "https://files.pythonhosted.org/packages/93/1f/5d6dbf551766308f6f50f8baf8e9860be6182911e8106da7a7f73785f4c4/websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413", size = 182252, upload-time = "2025-03-05T20:02:05.29Z" }, + { url = "https://files.pythonhosted.org/packages/d4/78/2d4fed9123e6620cbf1706c0de8a1632e1a28e7774d94346d7de1bba2ca3/websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8", size = 182521, upload-time = "2025-03-05T20:02:07.458Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3b/66d4c1b444dd1a9823c4a81f50231b921bab54eee2f69e70319b4e21f1ca/websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3", size = 181958, upload-time = 
"2025-03-05T20:02:09.842Z" }, + { url = "https://files.pythonhosted.org/packages/08/ff/e9eed2ee5fed6f76fdd6032ca5cd38c57ca9661430bb3d5fb2872dc8703c/websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf", size = 181918, upload-time = "2025-03-05T20:02:11.968Z" }, + { url = "https://files.pythonhosted.org/packages/d8/75/994634a49b7e12532be6a42103597b71098fd25900f7437d6055ed39930a/websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85", size = 176388, upload-time = "2025-03-05T20:02:13.32Z" }, + { url = "https://files.pythonhosted.org/packages/98/93/e36c73f78400a65f5e236cd376713c34182e6663f6889cd45a4a04d8f203/websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065", size = 176828, upload-time = "2025-03-05T20:02:14.585Z" }, + { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, +] + +[[package]] +name = "werkzeug" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/69/83029f1f6300c5fb2471d621ab06f6ec6b3324685a2ce0f9777fd4a8b71e/werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746", size = 806925, upload-time = "2024-11-08T15:52:18.093Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload-time = "2024-11-08T15:52:16.132Z" }, +] + 
+[[package]] +name = "wheel" +version = "0.45.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/98/2d9906746cdc6a6ef809ae6338005b3f21bb568bea3165cfc6a243fdc25c/wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729", size = 107545, upload-time = "2024-11-23T00:18:23.513Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload-time = "2024-11-23T00:18:21.207Z" }, +] + +[[package]] +name = "wsproto" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/4a/44d3c295350d776427904d73c189e10aeae66d7f555bb2feee16d1e4ba5a/wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065", size = 53425, upload-time = "2022-08-23T19:58:21.447Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/58/e860788190eba3bcce367f74d29c4675466ce8dddfba85f7827588416f01/wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736", size = 24226, upload-time = "2022-08-23T19:58:19.96Z" }, +] + +[[package]] +name = "xxhash" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241, upload-time = "2024-08-17T09:20:38.972Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", 
hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969, upload-time = "2024-08-17T09:18:00.852Z" }, + { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800, upload-time = "2024-08-17T09:18:01.863Z" }, + { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566, upload-time = "2024-08-17T09:18:03.461Z" }, + { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214, upload-time = "2024-08-17T09:18:05.616Z" }, + { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433, upload-time = "2024-08-17T09:18:06.957Z" }, + { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822, upload-time = "2024-08-17T09:18:08.331Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538, upload-time = "2024-08-17T09:18:10.332Z" }, + { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953, upload-time = "2024-08-17T09:18:11.707Z" }, + { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594, upload-time = "2024-08-17T09:18:13.799Z" }, + { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971, upload-time = "2024-08-17T09:18:15.824Z" }, + { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050, upload-time = "2024-08-17T09:18:17.142Z" }, + { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216, upload-time = "2024-08-17T09:18:18.779Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120, upload-time = "2024-08-17T09:18:20.009Z" }, + { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003, upload-time = "2024-08-17T09:18:21.052Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777, upload-time = "2024-08-17T09:18:22.809Z" }, +] + +[[package]] +name = "yapf" +version = "0.43.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "platformdirs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/97/b6f296d1e9cc1ec25c7604178b48532fa5901f721bcf1b8d8148b13e5588/yapf-0.43.0.tar.gz", hash = "sha256:00d3aa24bfedff9420b2e0d5d9f5ab6d9d4268e72afbf59bb3fa542781d5218e", size = 254907, upload-time = "2024-11-14T00:11:41.584Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/81/6acd6601f61e31cfb8729d3da6d5df966f80f374b78eff83760714487338/yapf-0.43.0-py3-none-any.whl", hash = "sha256:224faffbc39c428cb095818cf6ef5511fdab6f7430a10783fdfb292ccf2852ca", size = 256158, upload-time = "2024-11-14T00:11:39.37Z" }, +] + +[[package]] +name = "yarl" +version = "1.20.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = 
"sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/18/893b50efc2350e47a874c5c2d67e55a0ea5df91186b2a6f5ac52eff887cd/yarl-1.20.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:47ee6188fea634bdfaeb2cc420f5b3b17332e6225ce88149a17c413c77ff269e", size = 133833, upload-time = "2025-06-10T00:43:07.393Z" }, + { url = "https://files.pythonhosted.org/packages/89/ed/b8773448030e6fc47fa797f099ab9eab151a43a25717f9ac043844ad5ea3/yarl-1.20.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d0f6500f69e8402d513e5eedb77a4e1818691e8f45e6b687147963514d84b44b", size = 91070, upload-time = "2025-06-10T00:43:09.538Z" }, + { url = "https://files.pythonhosted.org/packages/e3/e3/409bd17b1e42619bf69f60e4f031ce1ccb29bd7380117a55529e76933464/yarl-1.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a8900a42fcdaad568de58887c7b2f602962356908eedb7628eaf6021a6e435b", size = 89818, upload-time = "2025-06-10T00:43:11.575Z" }, + { url = "https://files.pythonhosted.org/packages/f8/77/64d8431a4d77c856eb2d82aa3de2ad6741365245a29b3a9543cd598ed8c5/yarl-1.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bad6d131fda8ef508b36be3ece16d0902e80b88ea7200f030a0f6c11d9e508d4", size = 347003, upload-time = "2025-06-10T00:43:14.088Z" }, + { url = "https://files.pythonhosted.org/packages/8d/d2/0c7e4def093dcef0bd9fa22d4d24b023788b0a33b8d0088b51aa51e21e99/yarl-1.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:df018d92fe22aaebb679a7f89fe0c0f368ec497e3dda6cb81a567610f04501f1", size = 336537, upload-time = "2025-06-10T00:43:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/f0/f3/fc514f4b2cf02cb59d10cbfe228691d25929ce8f72a38db07d3febc3f706/yarl-1.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:8f969afbb0a9b63c18d0feecf0db09d164b7a44a053e78a7d05f5df163e43833", size = 362358, upload-time = "2025-06-10T00:43:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/ea/6d/a313ac8d8391381ff9006ac05f1d4331cee3b1efaa833a53d12253733255/yarl-1.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:812303eb4aa98e302886ccda58d6b099e3576b1b9276161469c25803a8db277d", size = 357362, upload-time = "2025-06-10T00:43:20.888Z" }, + { url = "https://files.pythonhosted.org/packages/00/70/8f78a95d6935a70263d46caa3dd18e1f223cf2f2ff2037baa01a22bc5b22/yarl-1.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98c4a7d166635147924aa0bf9bfe8d8abad6fffa6102de9c99ea04a1376f91e8", size = 348979, upload-time = "2025-06-10T00:43:23.169Z" }, + { url = "https://files.pythonhosted.org/packages/cb/05/42773027968968f4f15143553970ee36ead27038d627f457cc44bbbeecf3/yarl-1.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12e768f966538e81e6e7550f9086a6236b16e26cd964cf4df35349970f3551cf", size = 337274, upload-time = "2025-06-10T00:43:27.111Z" }, + { url = "https://files.pythonhosted.org/packages/05/be/665634aa196954156741ea591d2f946f1b78ceee8bb8f28488bf28c0dd62/yarl-1.20.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe41919b9d899661c5c28a8b4b0acf704510b88f27f0934ac7a7bebdd8938d5e", size = 363294, upload-time = "2025-06-10T00:43:28.96Z" }, + { url = "https://files.pythonhosted.org/packages/eb/90/73448401d36fa4e210ece5579895731f190d5119c4b66b43b52182e88cd5/yarl-1.20.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:8601bc010d1d7780592f3fc1bdc6c72e2b6466ea34569778422943e1a1f3c389", size = 358169, upload-time = "2025-06-10T00:43:30.701Z" }, + { url = "https://files.pythonhosted.org/packages/c3/b0/fce922d46dc1eb43c811f1889f7daa6001b27a4005587e94878570300881/yarl-1.20.1-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:daadbdc1f2a9033a2399c42646fbd46da7992e868a5fe9513860122d7fe7a73f", size = 362776, upload-time = "2025-06-10T00:43:32.51Z" }, + { url = "https://files.pythonhosted.org/packages/f1/0d/b172628fce039dae8977fd22caeff3eeebffd52e86060413f5673767c427/yarl-1.20.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:03aa1e041727cb438ca762628109ef1333498b122e4c76dd858d186a37cec845", size = 381341, upload-time = "2025-06-10T00:43:34.543Z" }, + { url = "https://files.pythonhosted.org/packages/6b/9b/5b886d7671f4580209e855974fe1cecec409aa4a89ea58b8f0560dc529b1/yarl-1.20.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:642980ef5e0fa1de5fa96d905c7e00cb2c47cb468bfcac5a18c58e27dbf8d8d1", size = 379988, upload-time = "2025-06-10T00:43:36.489Z" }, + { url = "https://files.pythonhosted.org/packages/73/be/75ef5fd0fcd8f083a5d13f78fd3f009528132a1f2a1d7c925c39fa20aa79/yarl-1.20.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:86971e2795584fe8c002356d3b97ef6c61862720eeff03db2a7c86b678d85b3e", size = 371113, upload-time = "2025-06-10T00:43:38.592Z" }, + { url = "https://files.pythonhosted.org/packages/50/4f/62faab3b479dfdcb741fe9e3f0323e2a7d5cd1ab2edc73221d57ad4834b2/yarl-1.20.1-cp311-cp311-win32.whl", hash = "sha256:597f40615b8d25812f14562699e287f0dcc035d25eb74da72cae043bb884d773", size = 81485, upload-time = "2025-06-10T00:43:41.038Z" }, + { url = "https://files.pythonhosted.org/packages/f0/09/d9c7942f8f05c32ec72cd5c8e041c8b29b5807328b68b4801ff2511d4d5e/yarl-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:26ef53a9e726e61e9cd1cda6b478f17e350fb5800b4bd1cd9fe81c4d91cfeb2e", size = 86686, upload-time = "2025-06-10T00:43:42.692Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" 
+source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +]