""" Core processing utilities for DocGenie document generation pipeline. Integrated functionality (All 19 Stages): - Stage 1-2: Seed selection, LLM prompting, response processing, PDF rendering, bbox extraction - Stage 3: Handwriting & visual element synthesis (WordStylist diffusion, stamps, barcodes, logos) - Stage 4: Image finalization & OCR (pdf2image, Microsoft Document Intelligence) - Stage 5: Dataset packaging (bbox normalization, GT verification, analysis, debug viz) References generationfolder for core pipeline logic. """ import asyncio import base64 import json import pathlib import tempfile import time import uuid import re from typing import List, Tuple, Optional, Dict, Any from io import BytesIO import requests import httpx from PIL import Image from pdf2image import convert_from_path from bs4 import BeautifulSoup from playwright.async_api import async_playwright import fitz # PyMuPDF for PDF processing from docgenie.generation.constants import BS_PARSER, HANDWRITING_CLASS_NAME, VISUAL_ELEMENT_TYPE_SYNONYMS from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient, create_message from docgenie.generation.pipeline_03_process_response import ( extract_html_documents_from_text, extract_gt, ) from docgenie.generation.pipeline_03.css import ( increase_handwriting_font_size, unmark_visual_elements, ) from docgenie.generation.pipeline_04_render_pdf_and_extract_geos import ( render_pdf_async, preprocess_html_for_pdf, ) from docgenie.generation.pipeline_04.extract_bbox import extract_bboxes_from_pdf # Stage 3 imports - we implement simplified versions directly in this file # The full pipeline functions are available but require SynDatasetDefinition # For API use, we extract elements directly from HTML/CSS from docgenie.generation.utils.pdfjs import MEASURE_DIMENSIONS from docgenie.generation.utils.stamp import create_stamp from docgenie import ENV # Import config for handwriting service URL from .config import settings async def 
async def download_image_to_base64(url: str) -> str:
    """
    Download an image or PDF from a URL and return it as a base64 JPEG.

    If the response is a PDF (detected from the ``Content-Type`` header or a
    ``.pdf`` suffix on the URL or on its path), the first page is rendered
    at 300 DPI and used as the image.

    The HTTP request is made with the synchronous ``requests`` library, so
    it is executed in the event loop's default executor; other coroutines
    keep running while the download (up to the 30 s timeout) is in flight.

    Args:
        url: Image or PDF URL

    Returns:
        Base64-encoded JPEG image string

    Raises:
        ValueError: If the URL points to a PDF that has no pages.

    Exceptions raised by ``requests.get`` / ``raise_for_status`` (network
    failures, HTTP error statuses) are propagated unchanged.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import urlparse

    # requests.get blocks; running it directly inside a coroutine would
    # freeze the whole event loop until the server answers or times out.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, lambda: requests.get(url, timeout=30)
    )
    response.raise_for_status()

    content_type = response.headers.get('Content-Type', '').lower()
    lowered_url = url.lower()
    is_pdf = (
        'application/pdf' in content_type
        or lowered_url.endswith('.pdf')
        # Also catch "…/file.pdf?token=…" and "…/file.pdf#page=2".
        or urlparse(lowered_url).path.endswith('.pdf')
    )

    # NOTE(review): the conversion below still runs on the event-loop thread,
    # as in the original. Offloading it as well would require confirming that
    # PyMuPDF/PIL may be used from worker threads in this deployment.
    if is_pdf:
        # Handle PDF: convert first page to image
        print(f" 📄 Detected PDF, converting first page to image: {url[:80]}...")

        # Load PDF from bytes
        pdf_document = fitz.open(stream=response.content, filetype="pdf")
        try:
            if len(pdf_document) == 0:
                raise ValueError("PDF has no pages")

            # Use 300 DPI for high quality (matrix zoom factor = DPI/72)
            zoom = 300 / 72
            pix = pdf_document[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # The PNG bytes are an independent copy, so the PIL image stays
            # valid after the document is closed.
            img = Image.open(BytesIO(pix.tobytes("png")))
        finally:
            # Release the document even when the PDF is empty or rendering
            # fails.
            pdf_document.close()

        print(f" ✓ Converted PDF to image: {img.size[0]}x{img.size[1]}px")
    else:
        # Handle regular image
        img = Image.open(BytesIO(response.content))

    # JPEG cannot store alpha/palette modes, so normalise to RGB first.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # Save as JPEG in memory
    buffer = BytesIO()
    img.save(buffer, format='JPEG', quality=95)

    # Encode to base64
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
def _fetch_seed_with_retries(url: str, max_retries: int = 3):
    """
    GET ``url`` and return the successful ``requests`` response.

    Retries on HTTP 429/502/503/504 and on timeout/connection errors,
    sleeping 2 s, then 4 s, ... (doubling) between attempts. No sleep
    happens after the final attempt; its exception is re-raised instead.
    Any other HTTP error status is re-raised immediately.
    """
    retryable_statuses = (429, 502, 503, 504)

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        wait_time = 2 * (2 ** attempt)
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            # e.response is normally set by raise_for_status(); guard anyway
            # so a missing response surfaces the HTTPError, not AttributeError.
            status_code = e.response.status_code if e.response is not None else None
            if status_code not in retryable_statuses or is_last_attempt:
                raise
            print(f" ⚠️ HTTP {status_code} error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})...")
            time.sleep(wait_time)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
            if is_last_attempt:
                raise
            print(f" ⚠️ Network error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries}): {e}")
            time.sleep(wait_time)

    # Only reachable when max_retries <= 0 (the loop body never ran).
    raise Exception(f"Failed to download seed image after {max_retries} attempts")


def _seed_bytes_to_base64_jpeg(content: bytes, content_type: str, url: str) -> str:
    """
    Convert downloaded image/PDF bytes to a base64-encoded JPEG string.

    The payload is treated as a PDF when ``content_type`` contains
    ``application/pdf`` or the URL (or its path) ends in ``.pdf``; in that
    case the first page is rendered at 300 DPI.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import urlparse

    lowered_url = url.lower()
    is_pdf = (
        'application/pdf' in content_type.lower()
        or lowered_url.endswith('.pdf')
        # Also catch "…/file.pdf?token=…" and "…/file.pdf#page=2".
        or urlparse(lowered_url).path.endswith('.pdf')
    )

    if is_pdf:
        # Handle PDF: convert first page to image
        print(f" 📄 Detected PDF, converting first page to image: {url[:80]}...")

        # Load PDF from bytes
        pdf_document = fitz.open(stream=content, filetype="pdf")
        try:
            if len(pdf_document) == 0:
                raise ValueError("PDF has no pages")

            # Use 300 DPI for high quality (matrix zoom factor = DPI/72)
            zoom = 300 / 72
            pix = pdf_document[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # The PNG bytes are an independent copy, so the PIL image stays
            # valid after the document is closed.
            img = Image.open(BytesIO(pix.tobytes("png")))
        finally:
            # Release the document even when the PDF is empty or rendering
            # fails.
            pdf_document.close()

        print(f" ✓ Converted PDF to image: {img.size[0]}x{img.size[1]}px")
    else:
        # Handle regular image
        img = Image.open(BytesIO(content))

    # JPEG cannot store alpha/palette modes, so normalise to RGB first.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # Save as JPEG in memory
    buffer = BytesIO()
    img.save(buffer, format='JPEG', quality=95)

    # Encode to base64
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


def download_seed_images(urls: List[str]) -> List[str]:
    """
    Download multiple seed images/PDFs and convert to base64 (synchronous version for worker).

    If a URL points to a PDF, converts the first page to an image.
    Each download is attempted up to 3 times; transient HTTP errors
    (503, 502, 504, 429), timeouts and connection errors trigger a retry
    with exponential backoff (2 s, then 4 s).

    Args:
        urls: List of image or PDF URLs

    Returns:
        List of base64-encoded JPEG image strings, in the same order as
        ``urls`` (empty list for empty input)

    Raises:
        ValueError: If a URL points to a PDF that has no pages.

    The ``requests`` exception from the last failed attempt (or from a
    non-retryable HTTP status) is propagated unchanged.
    """
    images = []
    for url in urls:
        # Download and convert one URL at a time, so a failure stops
        # processing before any later URL is requested.
        response = _fetch_seed_with_retries(url)
        content_type = response.headers.get('Content-Type', '')
        images.append(_seed_bytes_to_base64_jpeg(response.content, content_type, url))
    return images
regular image img = Image.open(BytesIO(response.content)) # Convert to RGB if necessary if img.mode != 'RGB': img = img.convert('RGB') # Save as JPEG in memory buffer = BytesIO() img.save(buffer, format='JPEG', quality=95) buffer.seek(0) # Encode to base64 img_base64 = base64.b64encode(buffer.read()).decode('utf-8') images.append(img_base64) return images def build_prompt( language: str, doc_type: str, gt_type: str, gt_format: str, num_solutions: int, num_seed_images: int, prompt_template_path: pathlib.Path, enable_visual_elements: bool = True, visual_element_types: List[str] = None ) -> str: """ Build the system prompt by injecting parameters into template. Args: language: Language for documents doc_type: Type of documents gt_type: Ground truth type description gt_format: Ground truth format specification num_solutions: Number of documents to generate num_seed_images: Number of seed images provided prompt_template_path: Path to prompt template file enable_visual_elements: Whether to include visual element instructions visual_element_types: List of allowed visual element types Returns: Formatted prompt string """ template = prompt_template_path.read_text(encoding='utf-8') # Handle dynamic Visual Placeholders block import re # Define placeholder block pattern ve_block_pattern = r"## Visual Placeholders \(if document type requires\)\n(.*?)\n\n" if not enable_visual_elements or not visual_element_types: # Remove the whole block template = re.sub(ve_block_pattern, "", template, flags=re.DOTALL) # Also remove the checklist item template = template.replace("- [ ] Visual elements are semantically coherent\n", "") else: # Update the block with specific types types_str = ", ".join(visual_element_types) # Example mapping EXAMPLES = { "stamp": '- Example: `
`', "logo": '- Example: ``', "figure": '- Example: ``', "barcode": '- Example: ``', "photo": '- Example: ``' } # Select examples selected_examples = [] for t in visual_element_types: if t in EXAMPLES: selected_examples.append(EXAMPLES[t]) if len(selected_examples) >= 2: break # Fallback if somehow no types matched (shouldn't happen with valid types) if len(selected_examples) == 0: selected_examples = [EXAMPLES["logo"], EXAMPLES["stamp"]] new_block = [ "## Visual Placeholders (if document type requires)", "- Insert `