"""
Core processing utilities for DocGenie document generation pipeline.

Integrated functionality (All 19 Stages):
- Stage 1-2: Seed selection, LLM prompting, response processing, PDF rendering, bbox extraction
- Stage 3: Handwriting & visual element synthesis (WordStylist diffusion, stamps, barcodes, logos)
- Stage 4: Image finalization & OCR (pdf2image, Microsoft Document Intelligence)
- Stage 5: Dataset packaging (bbox normalization, GT verification, analysis, debug viz)

References generation folder for core pipeline logic.
"""
import asyncio
import base64
import json
import pathlib
import tempfile
import time
import uuid
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import requests
import httpx
from PIL import Image
from pdf2image import convert_from_path
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import fitz  # PyMuPDF for PDF processing

from docgenie.generation.constants import BS_PARSER, HANDWRITING_CLASS_NAME, VISUAL_ELEMENT_TYPE_SYNONYMS
from docgenie.generation.pipeline_01.claude_batching import ClaudeBatchedClient, create_message
from docgenie.generation.pipeline_03_process_response import (
    extract_html_documents_from_text,
    extract_gt,
)
from docgenie.generation.pipeline_03.css import (
    increase_handwriting_font_size,
    unmark_visual_elements,
)
from docgenie.generation.pipeline_04_render_pdf_and_extract_geos import (
    render_pdf_async,
    preprocess_html_for_pdf,
)
from docgenie.generation.pipeline_04.extract_bbox import extract_bboxes_from_pdf

# Stage 3 imports - we implement simplified versions directly in this file
# The full pipeline functions are available but require SynDatasetDefinition
# For API use, we extract elements directly from HTML/CSS
from docgenie.generation.utils.pdfjs import MEASURE_DIMENSIONS
from docgenie.generation.utils.stamp import create_stamp
from docgenie import ENV

# Import config for handwriting service URL
from .config import settings

# HTTP status codes treated as transient and retried on seed downloads.
_RETRYABLE_STATUS_CODES = (502, 503, 504, 429)


def _pdf_first_page_to_image(pdf_bytes: bytes, url: str) -> Image.Image:
    """
    Render the first page of a PDF (given as raw bytes) to a PIL image.

    Renders at 300 DPI for high quality (PyMuPDF matrix zoom = DPI / 72).

    Args:
        pdf_bytes: Raw PDF file content
        url: Source URL, used only for log messages

    Returns:
        PIL Image of the first page

    Raises:
        ValueError: If the PDF contains no pages
    """
    print(f" 📄 Detected PDF, converting first page to image: {url[:80]}...")

    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        if len(pdf_document) == 0:
            raise ValueError("PDF has no pages")

        page = pdf_document[0]
        zoom = 300 / 72  # matrix zoom factor = DPI / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img = Image.open(BytesIO(pix.tobytes("png")))
    finally:
        # Always release the document, even if rendering fails.
        pdf_document.close()

    print(f" ✓ Converted PDF to image: {img.size[0]}x{img.size[1]}px")
    return img


def _response_to_base64_jpeg(response: requests.Response, url: str) -> str:
    """
    Convert a fetched image-or-PDF HTTP response into a base64 JPEG string.

    PDFs (detected by Content-Type or a ``.pdf`` URL suffix) have their first
    page rasterized; regular images are decoded directly. The result is
    normalized to RGB and re-encoded as JPEG at quality 95.

    Args:
        response: Completed HTTP response whose body is the image/PDF
        url: Source URL (used for PDF detection and logging)

    Returns:
        Base64-encoded JPEG image string
    """
    content_type = response.headers.get('Content-Type', '').lower()
    is_pdf = 'application/pdf' in content_type or url.lower().endswith('.pdf')

    if is_pdf:
        img = _pdf_first_page_to_image(response.content, url)
    else:
        img = Image.open(BytesIO(response.content))

    # JPEG cannot store alpha/palette modes; normalize to RGB first.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    buffer = BytesIO()
    img.save(buffer, format='JPEG', quality=95)
    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode('utf-8')


def _get_with_retry(url: str, max_retries: int = 3, timeout: int = 30) -> requests.Response:
    """
    GET *url*, retrying transient failures with exponential backoff.

    Retries HTTP 502/503/504/429 responses as well as timeouts and connection
    errors, waiting 2s, 4s, 8s between attempts. Non-transient HTTP errors and
    the final failed attempt are re-raised to the caller.

    Args:
        url: URL to fetch
        max_retries: Maximum number of attempts
        timeout: Per-request timeout in seconds

    Returns:
        Successful ``requests.Response``
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in _RETRYABLE_STATUS_CODES and attempt < max_retries - 1:
                wait_time = 2 * (2 ** attempt)  # Exponential backoff: 2s, 4s, 8s
                print(f" ⚠️ HTTP {e.response.status_code} error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
                continue
            # Non-retryable status or last attempt: propagate.
            raise
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
            if attempt < max_retries - 1:
                wait_time = 2 * (2 ** attempt)
                print(f" ⚠️ Network error downloading seed image, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries}): {e}")
                time.sleep(wait_time)
                continue
            raise
    # Unreachable in practice (the loop either returns or raises); kept as a
    # defensive terminal error.
    raise RuntimeError(f"Failed to download seed image after {max_retries} attempts")


async def download_image_to_base64(url: str) -> str:
    """
    Download image or PDF from URL and convert to base64 JPEG.
    If URL points to a PDF, converts the first page to an image.

    Args:
        url: Image or PDF URL

    Returns:
        Base64-encoded JPEG image string
    """
    # requests.get is blocking; run it in a worker thread so this coroutine
    # does not stall the event loop while waiting on the network.
    response = await asyncio.to_thread(requests.get, url, timeout=30)
    response.raise_for_status()
    # NOTE(review): the PDF/JPEG conversion below is CPU-bound and still runs
    # on the loop thread — consider offloading it too if payloads are large.
    return _response_to_base64_jpeg(response, url)


def download_seed_images(urls: List[str]) -> List[str]:
    """
    Download multiple seed images/PDFs and convert to base64 (synchronous version for worker).
    If a URL points to a PDF, converts the first page to an image.

    Implements retry logic for transient HTTP errors (503, 502, 504, 429).

    Args:
        urls: List of image or PDF URLs

    Returns:
        List of base64-encoded JPEG image strings
    """
    return [_response_to_base64_jpeg(_get_with_retry(url), url) for url in urls]


def build_prompt(
    language: str,
    doc_type: str,
    gt_type: str,
    gt_format: str,
    num_solutions: int,
    num_seed_images: int,
    prompt_template_path: pathlib.Path
) -> str:
    """
    Build the system prompt by injecting parameters into template.

    Args:
        language: Language for documents
        doc_type: Type of documents
        gt_type: Ground truth type description
        gt_format: Ground truth format specification
        num_solutions: Number of documents to generate
        num_seed_images: Number of seed images provided
        prompt_template_path: Path to prompt template file

    Returns:
        Formatted prompt string
    """
    template = prompt_template_path.read_text(encoding='utf-8')

    # Template placeholders match the keyword names below.
    return template.format(
        language=language,
        doc_type=doc_type,
        gt_type=gt_type,
        gt_format=gt_format,
        num_solutions=num_solutions,
        num_seed_images=num_seed_images
    )


async def call_claude_api_direct(
    prompt: str,
    seed_images_base64: List[str],
    api_key: str,
    model: str = "claude-sonnet-4-5-20250929",
    max_tokens: int = 16384
) -> str:
    """
    Call Claude API directly (non-batched) with prompt and seed images.
    Used for API endpoint for immediate synchronous responses.

    Args:
        prompt: System prompt
        seed_images_base64: List of base64-encoded seed images
        api_key: Anthropic API key
        model: Claude model name
        max_tokens: Maximum tokens for response

    Returns:
        Raw LLM response text
    """
    import anthropic

    # NOTE(review): this uses the synchronous Anthropic client inside an async
    # function, blocking the event loop for the duration of the API call —
    # consider anthropic.AsyncAnthropic. Kept as-is to preserve behavior.
    client = anthropic.Anthropic(api_key=api_key)

    # Build message using the same format as the batched client.
    message_content = create_message(prompt=prompt, images_base64=seed_images_base64)

    # Plain (non-cached) call; no cache_control blocks are attached here.
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[message_content],
    )

    # Concatenate all text blocks of the response.
    return "".join(block.text for block in message.content if block.type == "text")


def extract_html_documents_from_response(response_text: str) -> List[str]:
    """
    Extract individual HTML documents from LLM response.
    Uses pipeline_03 function for consistency.

    Args:
        response_text: Raw LLM response

    Returns:
        List of HTML document strings
    """
    return extract_html_documents_from_text(text=response_text)


def extract_ground_truth(html: str) -> Tuple[Optional[dict], str]:
    """
    Extract ground truth JSON from HTML and return cleaned HTML.
    Uses pipeline_03 function for consistency.

    Args:
        html: HTML document with embedded GT

    Returns:
        Tuple of (ground_truth_dict, html_without_gt). If no GT is found or
        the GT is not valid JSON, returns (None, original_html) — the GT
        markup is deliberately left in place in the failure case.
    """
    raw_json, html_clean, soup = extract_gt(html=html)

    if raw_json:
        try:
            return json.loads(raw_json), html_clean
        except json.JSONDecodeError:
            return None, html

    return None, html


def extract_css_from_html(html: str) -> Tuple[str, str]:
    """
    Extract CSS from HTML and return both separately.

    Args:
        html: HTML document

    Returns:
        Tuple of (css_string, html_string)
    """
    soup = BeautifulSoup(html, BS_PARSER)

    css_parts = []

    # Extract from