""" Evidence Builder for Document Grounding Creates evidence references for extracted information. Handles image cropping and base64 encoding. """ import base64 import io from typing import List, Optional, Dict, Any, Tuple from pydantic import BaseModel, Field import numpy as np from PIL import Image from loguru import logger from ..schemas.core import ( BoundingBox, DocumentChunk, EvidenceRef, OCRRegion, ) class GroundingConfig(BaseModel): """Configuration for grounding and evidence generation.""" # Image cropping include_images: bool = Field( default=True, description="Include cropped images in evidence" ) crop_padding: int = Field( default=10, ge=0, description="Padding around crop regions in pixels" ) max_image_size: int = Field( default=512, ge=64, description="Maximum dimension for cropped images" ) image_format: str = Field( default="PNG", description="Image format for encoding (PNG/JPEG)" ) image_quality: int = Field( default=85, ge=1, le=100, description="JPEG quality if using JPEG format" ) # Snippet settings max_snippet_length: int = Field( default=200, ge=50, description="Maximum length of text snippets" ) include_context: bool = Field( default=True, description="Include surrounding context in snippets" ) def crop_region_image( image: np.ndarray, bbox: BoundingBox, padding: int = 10, max_size: Optional[int] = None, ) -> np.ndarray: """ Crop a region from an image. Args: image: Source image (RGB, HWC format) bbox: Bounding box to crop padding: Padding around the crop max_size: Maximum dimension (will resize if larger) Returns: Cropped image as numpy array """ height, width = image.shape[:2] # Get coordinates with padding x1 = max(0, int(bbox.x_min) - padding) y1 = max(0, int(bbox.y_min) - padding) x2 = min(width, int(bbox.x_max) + padding) y2 = min(height, int(bbox.y_max) + padding) # Crop cropped = image[y1:y2, x1:x2] # Resize if needed if max_size and max(cropped.shape[:2]) > max_size: pil_img = Image.fromarray(cropped) pil_img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) cropped = np.array(pil_img) return cropped def encode_image_base64( image: np.ndarray, format: str = "PNG", quality: int = 85, ) -> str: """ Encode image to base64 string. Args: image: Image as numpy array format: Image format (PNG/JPEG) quality: JPEG quality if applicable Returns: Base64-encoded string """ pil_img = Image.fromarray(image) # Convert to RGB if needed if pil_img.mode != "RGB": pil_img = pil_img.convert("RGB") # Encode buffer = io.BytesIO() if format.upper() == "JPEG": pil_img.save(buffer, format="JPEG", quality=quality) else: pil_img.save(buffer, format="PNG") buffer.seek(0) encoded = base64.b64encode(buffer.read()).decode("utf-8") return encoded def create_evidence_ref( chunk: DocumentChunk, source_type: str = "text", snippet: Optional[str] = None, confidence: float = 1.0, image: Optional[np.ndarray] = None, config: Optional[GroundingConfig] = None, ) -> EvidenceRef: """ Create an evidence reference from a document chunk. Args: chunk: Source chunk source_type: Type of source (text/table/figure) snippet: Optional specific snippet (defaults to chunk text) confidence: Confidence score image: Optional page image for cropping config: Grounding configuration Returns: EvidenceRef instance """ config = config or GroundingConfig() # Create snippet if snippet is None: snippet = chunk.text[:config.max_snippet_length] if len(chunk.text) > config.max_snippet_length: snippet += "..." 

    # Create base evidence
    evidence = EvidenceRef(
        chunk_id=chunk.chunk_id,
        page=chunk.page,
        bbox=chunk.bbox,
        source_type=source_type,
        snippet=snippet,
        confidence=confidence,
    )

    # Add image if available and configured
    if image is not None and config.include_images:
        try:
            cropped = crop_region_image(
                image,
                chunk.bbox,
                padding=config.crop_padding,
                max_size=config.max_image_size,
            )
            evidence.image_base64 = encode_image_base64(
                cropped,
                format=config.image_format,
                quality=config.image_quality,
            )
        except Exception as e:
            logger.warning(f"Failed to crop evidence image: {e}")

    return evidence


class EvidenceBuilder:
    """
    Builder for creating evidence references.

    Handles:
    - Evidence from chunks
    - Evidence from OCR regions
    - Evidence aggregation
    - Image cropping and encoding
    """

    def __init__(self, config: Optional[GroundingConfig] = None):
        """Initialize evidence builder."""
        self.config = config or GroundingConfig()

    def from_chunk(
        self,
        chunk: DocumentChunk,
        image: Optional[np.ndarray] = None,
        additional_context: Optional[str] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from a chunk.

        Args:
            chunk: Source chunk
            image: Optional page image for visual evidence
            additional_context: Optional additional context

        Returns:
            EvidenceRef
        """
        # Determine source type
        source_type = chunk.chunk_type.value

        # Build snippet with optional context
        snippet = chunk.text[:self.config.max_snippet_length]
        if additional_context:
            snippet = f"{additional_context}\n{snippet}"
        if len(chunk.text) > self.config.max_snippet_length:
            snippet += "..."

        return create_evidence_ref(
            chunk=chunk,
            source_type=source_type,
            snippet=snippet,
            confidence=chunk.confidence,
            image=image,
            config=self.config,
        )

    def from_ocr_region(
        self,
        region: OCRRegion,
        chunk_id: str,
        document_id: str,
        image: Optional[np.ndarray] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from an OCR region.

        Args:
            region: OCR region
            chunk_id: ID to assign
            document_id: Parent document ID
            image: Optional page image

        Returns:
            EvidenceRef
        """
        # Create a temporary chunk for the evidence
        from ..schemas.core import DocumentChunk, ChunkType

        chunk = DocumentChunk(
            chunk_id=chunk_id,
            chunk_type=ChunkType.TEXT,
            text=region.text,
            bbox=region.bbox,
            page=region.page,
            document_id=document_id,
            source_path=None,
            sequence_index=0,
            confidence=region.confidence,
        )

        return self.from_chunk(chunk, image)

    def aggregate_evidence(
        self,
        evidence_list: List[EvidenceRef],
        combine_snippets: bool = True,
    ) -> List[EvidenceRef]:
        """
        Aggregate and deduplicate evidence references.

        Args:
            evidence_list: List of evidence references
            combine_snippets: Whether to combine snippets from same chunk

        Returns:
            Deduplicated evidence list
        """
        if not evidence_list:
            return []

        # Group by chunk_id
        by_chunk: Dict[str, List[EvidenceRef]] = {}
        for ev in evidence_list:
            if ev.chunk_id not in by_chunk:
                by_chunk[ev.chunk_id] = []
            by_chunk[ev.chunk_id].append(ev)

        # Combine or select best
        result = []
        for chunk_id, evidences in by_chunk.items():
            if len(evidences) == 1:
                result.append(evidences[0])
            else:
                # Take highest confidence, combine snippets
                best = max(evidences, key=lambda e: e.confidence)
                if combine_snippets:
                    all_snippets = list(set(e.snippet for e in evidences))
                    combined = " ... ".join(all_snippets[:3])
".join(all_snippets[:3]) best = EvidenceRef( chunk_id=best.chunk_id, page=best.page, bbox=best.bbox, source_type=best.source_type, snippet=combined[:self.config.max_snippet_length], confidence=best.confidence, image_base64=best.image_base64, ) result.append(best) # Sort by page and position result.sort(key=lambda e: (e.page, e.bbox.y_min, e.bbox.x_min)) return result def create_grounding_context( self, evidence_list: List[EvidenceRef], include_images: bool = False, ) -> str: """ Create a text context from evidence for LLM prompting. Args: evidence_list: Evidence references include_images: Whether to include image markers Returns: Formatted context string """ if not evidence_list: return "" lines = ["Evidence from document:"] for i, ev in enumerate(evidence_list, 1): lines.append( f"\n[{i}] Page {ev.page + 1}, {ev.source_type} " f"(confidence: {ev.confidence:.2f}):" ) lines.append(f' "{ev.snippet}"') if include_images and ev.image_base64: lines.append(" [Image available]") return "\n".join(lines)