Spaces:

MHamdan
/

SPARKNET

Sleeping

Initial commit: SPARKNET framework

d520909 13 days ago

1.78 kB

	"""
	Document Intelligence IO Module

	Document loading, rendering, and caching:
	- PDF loading with PyMuPDF
	- Image loading (JPEG, PNG, TIFF)
	- Page rendering at configurable DPI
	- File-based caching with LRU eviction
	"""

	from .base import (
	# Format detection
	DocumentFormat,
	# Metadata
	PageInfo,
	DocumentInfo,
	# Options
	RenderOptions,
	# Base classes
	DocumentLoader,
	PageRenderer,
	DocumentProcessor,
	)

	from .pdf import (
	PDFLoader,
	PDFRenderer,
	PDFTextExtractor,
	load_pdf,
	)

	from .image import (
	ImageLoader,
	ImageRenderer,
	load_image,
	)

	from .cache import (
	CacheConfig,
	CacheEntry,
	DocumentCache,
	get_document_cache,
	cached_page,
	)

	# Public API of this package: the names exported by a star-import.
	# Everything listed here is either re-exported from a submodule above or
	# defined in this module.
	__all__ = [
	    # Format
	    "DocumentFormat",
	    # Metadata
	    "PageInfo",
	    "DocumentInfo",
	    "RenderOptions",
	    # Base
	    "DocumentLoader",
	    "PageRenderer",
	    "DocumentProcessor",
	    # PDF
	    "PDFLoader",
	    "PDFRenderer",
	    "PDFTextExtractor",
	    "load_pdf",
	    # Image
	    "ImageLoader",
	    "ImageRenderer",
	    "load_image",
	    # Cache
	    "CacheConfig",
	    "CacheEntry",
	    "DocumentCache",
	    "get_document_cache",
	    "cached_page",
	    # Format-dispatching loader defined in this module
	    "load_document",
	]


	def load_document(path):
	    """
	    Open a document with the loader that matches its format.

	    The format is resolved by ``DocumentFormat.from_path``. PDF documents
	    are handed to ``load_pdf``; ``IMAGE`` and ``TIFF_MULTIPAGE`` documents
	    are handed to ``load_image``.

	    Args:
	        path: Location of the document file; anything ``pathlib.Path``
	            accepts.

	    Returns:
	        Whatever the selected loader function returns
	        (NOTE(review): previously documented as a ``(loader, renderer)``
	        tuple -- confirm against ``load_pdf`` / ``load_image``).

	    Raises:
	        ValueError: If the detected format is neither PDF nor one of the
	            image formats.
	    """
	    from pathlib import Path

	    path = Path(path)
	    detected = DocumentFormat.from_path(path)

	    # Guard clauses: return from the first loader that handles the format.
	    if detected == DocumentFormat.PDF:
	        return load_pdf(path)

	    if detected in {DocumentFormat.IMAGE, DocumentFormat.TIFF_MULTIPAGE}:
	        return load_image(path)

	    raise ValueError(f"Unsupported document format: {path.suffix}")