Spaces:

build-small-hackathon
/

FinSightAI

Running

App Files Files Community

FinSightAI / backend /utils /pdf_parser.py

Aniket2003333333

start

7248d39 7 days ago

Raw

History Blame Contribute Delete

4.67 kB

	"""PyMuPDF-based PDF parsing utilities."""

	from __future__ import annotations

	from dataclasses import dataclass
	from typing import List, Tuple

	import fitz
	from PIL import Image

	# A page whose stripped embedded text has fewer characters than this is
	# flagged as sparse by the extraction functions in this module.
	SPARSE_TEXT_THRESHOLD = 100
	# Largest difference between a block's top y and the top y of the first block
	# of the current line for the block to still be placed on that line.
	_LINE_Y_TOLERANCE = 4.0
	# Horizontal distance that is rendered as a single space character when
	# rebuilding column spacing (presumably PDF points, per the name).
	_SPACE_POINTS = 3.5


	@dataclass
	class PDFPage:
	    """One parsed PDF page, as built by ``extract_pdf_pages``."""

	    # 1-based position of the page within the document.
	    page_number: int
	    # Text returned by the page's plain-text extraction (not stripped).
	    embedded_text: str
	    # RGB raster of the page built from its rendered pixmap.
	    image: Image.Image
	    # True when the stripped embedded text is shorter than
	    # SPARSE_TEXT_THRESHOLD characters.
	    is_sparse: bool


	def extract_pdf_pages(file_bytes: bytes, dpi_scale: float = 2.0) -> List[PDFPage]:
	    """Parse an in-memory PDF into one ``PDFPage`` per page.

	    Args:
	        file_bytes: Raw bytes of the PDF document.
	        dpi_scale: Zoom factor applied on both axes when rasterising a page.

	    Returns:
	        The pages in document order, numbered from 1. Each entry carries the
	        page's embedded text, an RGB image of the page, and a flag that is
	        True when the stripped text is shorter than ``SPARSE_TEXT_THRESHOLD``.
	    """
	    results: List[PDFPage] = []
	    # The context manager closes the document even if a page fails to render.
	    with fitz.open(stream=file_bytes, filetype="pdf") as document:
	        for number, pdf_page in enumerate(document, start=1):
	            text = pdf_page.get_text("text")
	            zoom = fitz.Matrix(dpi_scale, dpi_scale)
	            pixmap = pdf_page.get_pixmap(matrix=zoom)
	            raster = Image.frombytes(
	                "RGB", [pixmap.width, pixmap.height], pixmap.samples
	            )
	            sparse = len(text.strip()) < SPARSE_TEXT_THRESHOLD
	            results.append(
	                PDFPage(
	                    page_number=number,
	                    embedded_text=text,
	                    image=raster,
	                    is_sparse=sparse,
	                )
	            )
	    return results


	def _group_blocks_into_lines(
	    blocks: List[Tuple[float, float, float, str]],
	    y_tolerance: float | None = None,
	) -> List[List[Tuple[float, float, str]]]:
	    """Cluster positioned text blocks into visual lines, top to bottom.

	    Args:
	        blocks: ``(y0, x0, x1, text)`` tuples. The list is not modified.
	        y_tolerance: Largest allowed difference between a block's ``y0`` and
	            the ``y0`` of the first block of the current line for the block
	            to join that line. Defaults to ``_LINE_Y_TOLERANCE``.

	    Returns:
	        One list per line, each holding ``(x0, x1, text)`` tuples in the order
	        the blocks were visited (sorted by ``y0`` rounded to one decimal,
	        then by ``x0``). Empty input yields an empty list.
	    """
	    if y_tolerance is None:
	        y_tolerance = _LINE_Y_TOLERANCE

	    # sorted() instead of list.sort(): the caller's list must stay untouched.
	    ordered = sorted(blocks, key=lambda item: (round(item[0], 1), item[1]))

	    lines: List[List[Tuple[float, float, str]]] = []
	    anchor_y: float | None = None
	    for y0, x0, x1, text in ordered:
	        # A line is anchored on its first block; later blocks are compared
	        # against that anchor, not against their immediate predecessor.
	        if anchor_y is None or abs(y0 - anchor_y) > y_tolerance:
	            lines.append([])
	            anchor_y = y0
	        lines[-1].append((x0, x1, text))
	    return lines


	def extract_page_spatial_text(page: fitz.Page) -> str:
	    """Rebuild page text with column spacing from native PDF text blocks.

	    Text blocks are flattened to single-line strings, grouped into visual
	    lines, and joined with runs of spaces proportional to the horizontal
	    gaps between them. When the page yields no usable text block, the
	    page's sorted plain text is returned instead.
	    """
	    positioned: List[Tuple[float, float, float, str]] = []
	    for entry in page.get_text("blocks"):
	        # Per PyMuPDF's block tuple layout, index 6 is the block type;
	        # only type 0 (text) blocks are kept.
	        if entry[6] == 0:
	            left, top, right, _bottom, raw_text = entry[:5]
	            flattened = raw_text.replace("\n", " ").strip()
	            if flattened:
	                positioned.append((top, left, right, flattened))

	    if not positioned:
	        return page.get_text("text", sort=True).strip()

	    rendered: List[str] = []
	    for row in _group_blocks_into_lines(positioned):
	        pieces: List[str] = []
	        previous_right = 0.0
	        for left, right, content in sorted(row, key=lambda item: item[0]):
	            if pieces:
	                # Between blocks: at least one space, more for wider gaps.
	                padding = max(1, int((left - previous_right) / _SPACE_POINTS))
	            else:
	                # First block of the line: indent by its left offset.
	                padding = max(0, int(left / _SPACE_POINTS))
	            if padding:
	                pieces.append(" " * padding)
	            pieces.append(content)
	            previous_right = right
	        rendered.append("".join(pieces).rstrip())

	    return "\n".join(rendered).strip()


	def extract_pdf_spatial_pages(file_bytes: bytes) -> List[Tuple[int, str, bool]]:
	    """Return (page_num, spatial_text, is_sparse) for each PDF page.

	    Page numbers start at 1. A page is sparse when its stripped embedded
	    text is shorter than ``SPARSE_TEXT_THRESHOLD``; sparse pages keep that
	    stripped text as-is, all others get the spatially rebuilt text.
	    """
	    results: List[Tuple[int, str, bool]] = []
	    # The context manager closes the document even if extraction fails.
	    with fitz.open(stream=file_bytes, filetype="pdf") as document:
	        for number, pdf_page in enumerate(document, start=1):
	            plain = pdf_page.get_text("text").strip()
	            sparse = len(plain) < SPARSE_TEXT_THRESHOLD
	            text = plain if sparse else extract_page_spatial_text(pdf_page)
	            results.append((number, text, sparse))
	    return results


	def render_page_image(
	    file_bytes: bytes, page_num: int, dpi_scale: float = 2.0
	) -> Image.Image:
	    """Render a single PDF page — used only when chart OCR is needed.

	    Args:
	        file_bytes: Raw bytes of the PDF document.
	        page_num: 1-based number of the page to render.
	        dpi_scale: Zoom factor applied on both axes when rasterising.

	    Returns:
	        The rendered page as an RGB PIL image.

	    Raises:
	        IndexError: If ``page_num`` is less than 1. Numbers past the last
	            page are left to PyMuPDF's own index check.
	    """
	    # Reject non-positive numbers before opening the document:
	    # ``doc[page_num - 1]`` would otherwise be a negative index and silently
	    # render a page counted from the end of the document.
	    if page_num < 1:
	        raise IndexError(f"page_num must be >= 1, got {page_num}")

	    # The context manager closes the document even if rendering fails.
	    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
	        zoom = fitz.Matrix(dpi_scale, dpi_scale)
	        pix = doc[page_num - 1].get_pixmap(matrix=zoom)
	        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)


	def render_page_png_base64(file_bytes: bytes, page_num: int = 1, dpi_scale: float = 2.0) -> str:
	    """Render one PDF page to PNG and return it base64-encoded.

	    Args:
	        file_bytes: Raw bytes of the PDF document.
	        page_num: 1-based number of the page to render (first page by default).
	        dpi_scale: Zoom factor applied on both axes when rasterising.

	    Returns:
	        The PNG bytes of the rendered page, base64-encoded as an ASCII string.

	    Raises:
	        IndexError: If ``page_num`` is less than 1. Numbers past the last
	            page are left to PyMuPDF's own index check.
	    """
	    import base64

	    # Reject non-positive numbers before opening the document:
	    # ``doc[page_num - 1]`` would otherwise be a negative index and silently
	    # render a page counted from the end of the document.
	    if page_num < 1:
	        raise IndexError(f"page_num must be >= 1, got {page_num}")

	    # The context manager closes the document even if rendering fails.
	    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
	        zoom = fitz.Matrix(dpi_scale, dpi_scale)
	        pix = doc[page_num - 1].get_pixmap(matrix=zoom)
	        return base64.b64encode(pix.tobytes("png")).decode("ascii")