MHamdan's picture
Initial commit: SPARKNET framework
d520909
"""
PDF Document Loading and Rendering
Uses PyMuPDF (fitz) for PDF operations.
Note: no pdf2image + poppler fallback is implemented in this module; PyMuPDF is required.
"""
import logging
from pathlib import Path
from typing import Iterator, List, Optional, Tuple, Union
import numpy as np
from PIL import Image
from .base import (
DocumentFormat,
DocumentInfo,
DocumentLoader,
PageInfo,
PageRenderer,
RenderOptions,
)
logger = logging.getLogger(__name__)
class PDFLoader(DocumentLoader):
    """
    PDF document loader using PyMuPDF.

    Opens a PDF, keeps the underlying ``fitz`` document open so that other
    components can render pages or extract text from it, and summarises the
    file as a ``DocumentInfo``.
    """

    def __init__(self):
        # Open fitz document, or None when nothing is loaded.
        self._doc = None
        # Summary produced by the most recent successful load().
        self._info: Optional[DocumentInfo] = None
        # Path passed to the most recent load() call.
        self._path: Optional[Path] = None

    def load(self, path: Union[str, Path]) -> DocumentInfo:
        """
        Load a PDF and extract its metadata.

        Any previously opened document is closed first. The loader's
        document and info are only updated once the whole file has been
        scanned successfully.

        Args:
            path: Location of the PDF file.

        Returns:
            The ``DocumentInfo`` describing the file (also available
            afterwards through the ``info`` property).

        Raises:
            ImportError: PyMuPDF is not installed.
            FileNotFoundError: ``path`` does not exist.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError as err:
            raise ImportError(
                "PyMuPDF (fitz) is required for PDF loading. "
                "Install with: pip install pymupdf"
            ) from err

        self._path = Path(path)
        if not self._path.exists():
            raise FileNotFoundError(f"PDF file not found: {self._path}")

        # Close any previously opened document and drop its summary so a
        # failed load cannot leave info describing a different file.
        self.close()
        self._info = None

        doc = fitz.open(str(self._path))
        try:
            info = self._build_info(doc, self._path)
        except Exception:
            # Do not keep a half-inspected document open.
            doc.close()
            raise

        self._doc = doc
        self._info = info
        return info

    @staticmethod
    def _build_info(doc, pdf_path: Path) -> DocumentInfo:
        """Scan every page of an open fitz document and build its DocumentInfo."""
        metadata = doc.metadata or {}

        pages = []
        has_text_layer = False
        has_images = False
        has_annotations = False

        for page_number, page in enumerate(doc, start=1):  # 1-indexed
            rect = page.rect

            page_has_text = bool(page.get_text().strip())
            page_has_images = bool(page.get_images(full=True))
            has_text_layer = has_text_layer or page_has_text
            has_images = has_images or page_has_images

            # Page.annots() is a generator and has no len(); first_annot is
            # None when the page carries no annotation.
            if not has_annotations and page.first_annot is not None:
                has_annotations = True

            pages.append(
                PageInfo(
                    page_number=page_number,
                    width_pixels=int(rect.width),
                    height_pixels=int(rect.height),
                    width_points=rect.width,
                    height_points=rect.height,
                    dpi=72,  # PDF native resolution
                    rotation=page.rotation,
                    has_text=page_has_text,
                    has_images=page_has_images,
                )
            )

        return DocumentInfo(
            path=pdf_path,
            format=DocumentFormat.PDF,
            num_pages=len(doc),
            pages=pages,
            title=metadata.get("title"),
            author=metadata.get("author"),
            subject=metadata.get("subject"),
            creator=metadata.get("creator"),
            creation_date=metadata.get("creationDate"),
            modification_date=metadata.get("modDate"),
            file_size_bytes=pdf_path.stat().st_size,
            is_encrypted=doc.is_encrypted,
            has_text_layer=has_text_layer,
            # Treated as scanned when images are present but no page has text.
            is_scanned=has_images and not has_text_layer,
            # is_form_pdf may be a field count rather than a bool in some
            # PyMuPDF versions (TODO confirm); normalise to a flag.
            has_forms=bool(doc.is_form_pdf),
            has_annotations=has_annotations,
        )

    def close(self) -> None:
        """Close the PDF document if one is open; safe to call repeatedly."""
        if self._doc is not None:
            self._doc.close()
            self._doc = None

    def is_loaded(self) -> bool:
        """Check if a document is loaded."""
        return self._doc is not None

    @property
    def info(self) -> Optional[DocumentInfo]:
        """Get the info from the last successful load (None if there is none)."""
        return self._info

    @property
    def document(self):
        """Get the underlying fitz document (for advanced use)."""
        return self._doc
class PDFRenderer(PageRenderer):
    """
    PDF page renderer using PyMuPDF.

    Renders pages of the document held by a ``PDFLoader`` to numpy image
    arrays at the DPI requested in ``RenderOptions``.
    """

    def __init__(self, loader: PDFLoader):
        # The loader owns the open document; the renderer only reads from it.
        self._loader = loader

    def render_page(
        self,
        page_number: int,
        options: Optional[RenderOptions] = None
    ) -> np.ndarray:
        """
        Render a PDF page to an image.

        Args:
            page_number: 1-indexed page number.
            options: Render settings; ``RenderOptions()`` is used when omitted.
                ``options.dpi`` and ``options.color_mode`` are read here.

        Returns:
            A uint8 array shaped (height, width) for color mode "L",
            (height, width, 4) for "RGBA" and (height, width, 3) otherwise.
            NOTE: the array is built with ``np.frombuffer`` over the pixmap
            samples, so it is presumably read-only; copy it before modifying
            in place.

        Raises:
            RuntimeError: No document is loaded.
            ValueError: ``page_number`` is outside 1..page count.
        """
        if not self._loader.is_loaded():
            raise RuntimeError("No document loaded")

        options = options or RenderOptions()
        doc = self._loader.document

        if page_number < 1 or page_number > len(doc):
            raise ValueError(f"Invalid page number: {page_number}")
        page = doc[page_number - 1]  # Convert to 0-indexed

        # PDF native resolution is 72 DPI, so the zoom is the DPI ratio.
        matrix = self._get_matrix(options.dpi / 72.0)

        mode = options.color_mode
        if mode == "L":
            colorspace = self._get_grayscale_colorspace()
        else:
            colorspace = self._get_rgb_colorspace()

        try:
            pixmap = page.get_pixmap(
                matrix=matrix,
                colorspace=colorspace,
                alpha=mode == "RGBA"
            )
            samples = np.frombuffer(pixmap.samples, dtype=np.uint8)
            if mode == "L":
                shape = (pixmap.height, pixmap.width)
            elif mode == "RGBA":
                shape = (pixmap.height, pixmap.width, 4)
            else:  # RGB
                shape = (pixmap.height, pixmap.width, 3)
            return samples.reshape(shape)
        except Exception as e:
            logger.error("Error rendering page %s: %s", page_number, e)
            raise

    def _get_matrix(self, zoom: float):
        """Get a uniform-scale transformation matrix for rendering."""
        import fitz
        return fitz.Matrix(zoom, zoom)

    def _get_rgb_colorspace(self):
        """Get RGB colorspace."""
        import fitz
        return fitz.csRGB

    def _get_grayscale_colorspace(self):
        """Get grayscale colorspace."""
        import fitz
        return fitz.csGRAY

    def render_pages(
        self,
        page_numbers: Optional[List[int]] = None,
        options: Optional[RenderOptions] = None
    ) -> Iterator[Tuple[int, np.ndarray]]:
        """
        Render multiple pages lazily.

        This is a generator: nothing is checked or rendered until iteration
        starts.

        Args:
            page_numbers: 1-indexed pages to render; all pages when omitted.
            options: Render settings forwarded to ``render_page``.

        Yields:
            ``(page_number, image)`` pairs in the order requested.

        Raises:
            RuntimeError: No document is loaded (raised on first iteration).
        """
        if not self._loader.is_loaded():
            raise RuntimeError("No document loaded")

        if page_numbers is None:
            # Count pages on the open document itself: the loader's cached
            # info can be None or out of date while a document is open.
            page_numbers = range(1, len(self._loader.document) + 1)

        for page_num in page_numbers:
            yield page_num, self.render_page(page_num, options)
class PDFTextExtractor:
    """
    Extract text and text positions from PDF.

    Useful for PDFs with embedded text layer. All page numbers are
    1-indexed.
    """

    def __init__(self, loader: "PDFLoader"):
        # The loader owns the open document; the extractor only reads from it.
        self._loader = loader

    def _get_page(self, page_number: int):
        """
        Return the page object for a 1-indexed page number.

        Raises:
            RuntimeError: No document is loaded.
            IndexError: ``page_number`` is outside 1..page count. Without this
                check, 0 or a negative number would index from the end of the
                document and silently return the wrong page. IndexError is
                kept because that is what an out-of-range index raised before.
        """
        if not self._loader.is_loaded():
            raise RuntimeError("No document loaded")
        doc = self._loader.document
        if page_number < 1 or page_number > len(doc):
            raise IndexError(f"Invalid page number: {page_number}")
        return doc[page_number - 1]  # Convert to 0-indexed

    def extract_text(self, page_number: int) -> str:
        """
        Extract plain text from a page.

        Raises:
            RuntimeError: No document is loaded.
            IndexError: ``page_number`` is out of range.
        """
        return self._get_page(page_number).get_text()

    def extract_text_with_positions(
        self,
        page_number: int
    ) -> List[dict]:
        """
        Extract text with bounding box positions.

        One entry is produced per text *span* (not per word); non-text
        blocks are skipped.

        Returns list of dicts with:
        - text: The span's text content
        - bbox: (x0, y0, x1, y1) as reported for the span
        - block_no: The block's "number" value (0 if absent)
        - line_no: Index of the line within its block
        - word_no: Index of the span within its line (key name kept for
          compatibility with existing callers)
        - font, size, flags: Span font name, size and flags

        Raises:
            RuntimeError: No document is loaded.
            IndexError: ``page_number`` is out of range.
        """
        layout = self._get_page(page_number).get_text("dict")

        spans = []
        for block in layout.get("blocks", []):
            if block.get("type") != 0:  # Only text blocks
                continue
            block_no = block.get("number", 0)
            for line_no, line in enumerate(block.get("lines", [])):
                for span_no, span in enumerate(line.get("spans", [])):
                    spans.append({
                        "text": span.get("text", ""),
                        "bbox": span.get("bbox", (0, 0, 0, 0)),
                        "block_no": block_no,
                        "line_no": line_no,
                        "word_no": span_no,
                        "font": span.get("font", ""),
                        "size": span.get("size", 0),
                        "flags": span.get("flags", 0),
                    })
        return spans

    def get_page_dimensions(self, page_number: int) -> Tuple[float, float]:
        """
        Get page dimensions (width, height) in points.

        Raises:
            RuntimeError: No document is loaded.
            IndexError: ``page_number`` is out of range.
        """
        rect = self._get_page(page_number).rect
        return rect.width, rect.height
def load_pdf(path: Union[str, Path]) -> Tuple[PDFLoader, PDFRenderer]:
    """
    Convenience function to load a PDF.

    Creates a ``PDFLoader``, loads ``path`` with it and wraps it in a
    ``PDFRenderer``. The caller is responsible for calling
    ``loader.close()`` when done.

    Args:
        path: Location of the PDF file.

    Returns:
        Tuple of (loader, renderer)

    Raises:
        Any exception raised by ``PDFLoader.load`` is propagated unchanged.

    Example:
        loader, renderer = load_pdf("document.pdf")
        info = loader.info
        for page_num in range(1, info.num_pages + 1):
            image = renderer.render_page(page_num)
    """
    loader = PDFLoader()
    try:
        loader.load(path)
    except Exception:
        # The caller never receives the loader on failure, so release any
        # document it may have opened before re-raising.
        loader.close()
        raise
    return loader, PDFRenderer(loader)