""" Core processing utilities for DocGenie document generation pipeline. Integrated functionality (All 19 Stages): - Stage 1-2: Seed selection, LLM prompting, response processing, PDF rendering, bbox extraction - Stage 3: Handwriting & visual element synthesis (WordStylist diffusion, stamps, barcodes, logos) - Stage 4: Image finalization & OCR (pdf2image, Microsoft Document Intelligence) - Stage 5: Dataset packaging (bbox normalization, GT verification, analysis, debug viz) References generationfolder for core pipeline logic. """ import asyncio import base64 import json import pathlib import tempfile import time import uuid import re from typing import List, Tuple, Optional, Dict, Any from io import BytesIO import requests import httpx from PIL import Image from pdf2image import convert_from_path from bs4 import BeautifulSoup from playwright.async_api import async_playwright import fitz # PyMuPDF for PDF processing from docgenie.generation.constants import BS_PARSER, HANDWRITING_CLASS_NAME, VISUAL_ELEMENT_TYPE_SYNONYMS from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient, create_message from docgenie.generation.pipeline_03_process_response import ( extract_html_documents_from_text, extract_gt, ) from docgenie.generation.pipeline_03.css import ( increase_handwriting_font_size, unmark_visual_elements, ) from docgenie.generation.pipeline_04_render_pdf_and_extract_geos import ( render_pdf_async, preprocess_html_for_pdf, ) from docgenie.generation.pipeline_04.extract_bbox import extract_bboxes_from_pdf # Stage 3 imports - we implement simplified versions directly in this file # The full pipeline functions are available but require SynDatasetDefinition # For API use, we extract elements directly from HTML/CSS from docgenie.generation.utils.pdfjs import MEASURE_DIMENSIONS from docgenie.generation.utils.stamp import create_stamp from docgenie import ENV # Import config for handwriting service URL from .config import settings async def 
def _response_to_base64_jpeg(content: bytes, content_type: str, url: str) -> str:
    """
    Convert a downloaded payload (image or PDF) into a base64-encoded JPEG.

    Shared by download_image_to_base64 and download_seed_images (the original
    code duplicated this logic in both).

    Args:
        content: Raw response body bytes
        content_type: Lower-cased Content-Type header value
        url: Source URL (used for PDF detection and logging only)

    Returns:
        Base64-encoded JPEG image string

    Raises:
        ValueError: If a PDF payload contains no pages
    """
    is_pdf = 'application/pdf' in content_type or url.lower().endswith('.pdf')

    if is_pdf:
        # Handle PDF: rasterize the first page only
        print(f" 📄 Detected PDF, converting first page to image: {url[:80]}...")
        pdf_document = fitz.open(stream=content, filetype="pdf")
        try:
            if len(pdf_document) == 0:
                raise ValueError("PDF has no pages")
            page = pdf_document[0]
            # 300 DPI for high quality (matrix zoom factor = DPI/72)
            zoom = 300 / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            img = Image.open(BytesIO(pix.tobytes("png")))
        finally:
            # Close even if rendering raises (the original leaked the handle on error)
            pdf_document.close()
        print(f" ✓ Converted PDF to image: {img.size[0]}x{img.size[1]}px")
    else:
        # Regular image payload
        img = Image.open(BytesIO(content))

    # JPEG cannot represent alpha/palette modes
    if img.mode != 'RGB':
        img = img.convert('RGB')

    buffer = BytesIO()
    img.save(buffer, format='JPEG', quality=95)
    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode('utf-8')


def _download_with_retries(url: str, max_retries: int = 3) -> requests.Response:
    """
    GET *url*, retrying transient failures with exponential backoff.

    Retries HTTP 502/503/504/429 responses plus timeouts and connection
    errors; waits 2s, 4s, 8s between attempts.

    Args:
        url: URL to fetch
        max_retries: Maximum number of attempts

    Returns:
        The successful requests.Response

    Raises:
        requests.exceptions.RequestException: Non-retryable error, or retries exhausted
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            # Retry only transient server-side / rate-limit statuses
            if e.response.status_code in [502, 503, 504, 429] and attempt < max_retries - 1:
                wait_time = 2 * (2 ** attempt)  # Exponential backoff: 2s, 4s, 8s
                print(f" ⚠️ HTTP {e.response.status_code} error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
                continue
            # Non-retryable error or last attempt
            raise
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
            if attempt < max_retries - 1:
                wait_time = 2 * (2 ** attempt)
                print(f" ⚠️ Network error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries}): {e}")
                time.sleep(wait_time)
                continue
            raise
    # Defensive fallback: the loop above always returns or raises
    raise Exception(f"Failed to download seed image after {max_retries} attempts")


async def download_image_to_base64(url: str) -> str:
    """
    Download image or PDF from URL and convert to base64 JPEG.

    If URL points to a PDF, converts the first page to an image.

    Args:
        url: Image or PDF URL

    Returns:
        Base64-encoded JPEG image string
    """
    def _fetch() -> str:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '').lower()
        return _response_to_base64_jpeg(response.content, content_type, url)

    # requests and PDF rasterization are blocking; the original ran them
    # inline inside this coroutine, stalling the event loop. Run them in a
    # worker thread instead.
    return await asyncio.to_thread(_fetch)


def download_seed_images(urls: List[str]) -> List[str]:
    """
    Download multiple seed images/PDFs and convert to base64
    (synchronous version for worker).

    If a URL points to a PDF, converts the first page to an image.
    Implements retry logic for transient HTTP errors (503, 502, 504, 429).

    Args:
        urls: List of image or PDF URLs

    Returns:
        List of base64-encoded JPEG image strings
    """
    images = []
    for url in urls:
        response = _download_with_retries(url)
        content_type = response.headers.get('Content-Type', '').lower()
        images.append(_response_to_base64_jpeg(response.content, content_type, url))
    return images
def build_prompt(
    language: str,
    doc_type: str,
    gt_type: str,
    gt_format: str,
    num_solutions: int,
    num_seed_images: int,
    prompt_template_path: pathlib.Path,
    enable_visual_elements: bool = True,
    visual_element_types: List[str] = None
) -> str:
    """
    Build the system prompt by injecting parameters into template.

    Args:
        language: Language for documents
        doc_type: Type of documents
        gt_type: Ground truth type description
        gt_format: Ground truth format specification
        num_solutions: Number of documents to generate
        num_seed_images: Number of seed images provided
        prompt_template_path: Path to prompt template file
        enable_visual_elements: Whether to include visual element instructions
        visual_element_types: List of allowed visual element types

    Returns:
        Formatted prompt string
    """
    template = prompt_template_path.read_text(encoding='utf-8')

    # The "Visual Placeholders" section of the template is rewritten (or
    # removed) dynamically depending on which visual element types are allowed.
    # Note: `re` is imported at module level; the original re-imported it here.
    ve_block_pattern = r"## Visual Placeholders \(if document type requires\)\n(.*?)\n\n"

    if not enable_visual_elements or not visual_element_types:
        # Remove the whole block, plus the related checklist item
        template = re.sub(ve_block_pattern, "", template, flags=re.DOTALL)
        template = template.replace("- [ ] Visual elements are semantically coherent\n", "")
    else:
        types_str = ", ".join(visual_element_types)

        # NOTE(review): the inline HTML snippets in these example strings were
        # lost when this file was mangled — the markup below is reconstructed.
        # Confirm against the original prompt templates before relying on it.
        EXAMPLES = {
            "stamp": '- Example: `<div class="visual-element" data-type="stamp" data-content="Round APPROVED stamp" style="position:absolute;z-index:10;top:40px;right:60px;width:120px;height:120px;"></div>`',
            "logo": '- Example: `<div class="visual-element" data-type="logo" data-content="Company logo" style="width:160px;height:60px;"></div>`',
            "figure": '- Example: `<div class="visual-element" data-type="figure" data-content="Bar chart of quarterly revenue" style="width:400px;height:240px;"></div>`',
            "barcode": '- Example: `<div class="visual-element" data-type="barcode" data-content="Code-128 barcode" style="width:200px;height:60px;"></div>`',
            "photo": '- Example: `<div class="visual-element" data-type="photo" data-content="Passport photo" style="width:120px;height:160px;"></div>`'
        }

        # Show at most two examples, in the caller's type order
        selected_examples = [EXAMPLES[t] for t in visual_element_types if t in EXAMPLES][:2]
        if not selected_examples:
            # Fallback if somehow no types matched (shouldn't happen with valid types)
            selected_examples = [EXAMPLES["logo"], EXAMPLES["stamp"]]

        new_block = [
            "## Visual Placeholders (if document type requires)",
            '- Insert `<div class="visual-element" data-type="..."></div>` for non-text elements at appropriate positions',
            f"- Valid types are: {types_str}",
            "- Add data-content attribute with actual content description",
            # Stamp-positioning guidance only when stamps are allowed
            "- For stamps, use `position:absolute;z-index:10;` and specify 'top' and 'right'" if "stamp" in visual_element_types else None,
            # Typo fixed: original emitted "appropiate"
            "- Always provide appropriate dimensions",
        ]
        new_block.extend(selected_examples)

        # Drop the conditional None entry, then terminate the block with a
        # blank line so it keeps matching ve_block_pattern's shape
        new_block_str = "\n".join(line for line in new_block if line is not None) + "\n\n"
        template = re.sub(ve_block_pattern, new_block_str, template, flags=re.DOTALL)

    # Inject parameters into template
    return template.format(
        language=language,
        doc_type=doc_type,
        gt_type=gt_type,
        gt_format=gt_format,
        num_solutions=num_solutions,
        num_seed_images=num_seed_images
    )
async def call_claude_api_direct(
    prompt: str,
    seed_images_base64: List[str],
    api_key: str,
    model: str = "claude-sonnet-4-5-20250929",
    max_tokens: int = 16384
) -> str:
    """
    Call Claude API directly (non-batched) with prompt and seed images.

    Used for API endpoint for immediate synchronous responses.

    Args:
        prompt: System prompt
        seed_images_base64: List of base64-encoded seed images
        api_key: Anthropic API key
        model: Claude model name
        max_tokens: Maximum tokens for response

    Returns:
        Raw LLM response text
    """
    import anthropic

    client = anthropic.Anthropic(api_key=api_key)

    # Build message using the same format as the batched client
    message_content = create_message(prompt=prompt, images_base64=seed_images_base64)

    # The sync anthropic client blocks; run the request in a worker thread so
    # this coroutine does not stall the event loop while waiting for the model.
    # (The original comment claimed "prompt caching enabled", but no
    # cache_control is set anywhere here — removed the misleading claim.)
    message = await asyncio.to_thread(
        client.messages.create,
        model=model,
        max_tokens=max_tokens,
        messages=[message_content],
    )

    # Concatenate all text blocks of the response
    return "".join(block.text for block in message.content if block.type == "text")


def extract_html_documents_from_response(response_text: str) -> List[str]:
    """
    Extract individual HTML documents from LLM response.

    Uses pipeline_03 function for consistency.

    Args:
        response_text: Raw LLM response

    Returns:
        List of HTML document strings
    """
    # Delegate to the pipeline function for HTML extraction
    return extract_html_documents_from_text(text=response_text)


def extract_ground_truth(html: str) -> Tuple[Optional[dict], str]:
    """
    Extract ground truth JSON from HTML and return cleaned HTML.

    Uses pipeline_03 function for consistency.

    Args:
        html: HTML document with embedded GT

    Returns:
        Tuple of (ground_truth_dict, html_without_gt). On missing or malformed
        GT, returns (None, original_html).
    """
    # extract_gt also returns the parsed soup, which is unused here
    raw_json, html_clean, _ = extract_gt(html=html)
    if raw_json:
        try:
            gt_dict = json.loads(raw_json)
            return gt_dict, html_clean
        except json.JSONDecodeError:
            # Malformed GT JSON: fall back to the ORIGINAL html (not the
            # cleaned one) — presumably so callers can inspect/retry; confirm.
            return None, html
    return None, html


def extract_css_from_html(html: str) -> Tuple[str, str]:
    """
    Extract CSS from HTML and return both separately.

    Args:
        html: HTML document

    Returns:
        Tuple of (css_string, html_string)
    """
    soup = BeautifulSoup(html, BS_PARSER)
    css_parts = []

    # Extract from