# FinSightAI/backend/utils/pdf_parser.py
# Source snapshot: commit 7248d39 ("start"), 4.67 kB.
"""PyMuPDF-based PDF parsing utilities."""
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Tuple
import fitz
from PIL import Image
# A page whose stripped embedded text is shorter than this many characters is
# flagged as sparse.
SPARSE_TEXT_THRESHOLD = 100
# Largest difference between the y0 of a block and the y0 that opened the
# current line for the block to still be placed on that line.
_LINE_Y_TOLERANCE = 4.0
# Horizontal distance rendered as one space character when rebuilding a line
# (page coordinate units; presumably PDF points -- TODO confirm).
_SPACE_POINTS = 3.5
@dataclass
class PDFPage:
    """One PDF page: its embedded text plus a rendered raster image."""

    # 1-based position of the page within the document.
    page_number: int
    # Text taken from the page's plain-text extraction, stored unstripped.
    embedded_text: str
    # Raster rendering of the page.
    image: Image.Image
    # True when the stripped embedded text is shorter than the sparse threshold.
    is_sparse: bool
def extract_pdf_pages(file_bytes: bytes, dpi_scale: float = 2.0) -> List[PDFPage]:
    """Collect the embedded text and a rendered image for every page of a PDF.

    Args:
        file_bytes: Raw bytes of the PDF document.
        dpi_scale: Scale factor applied to both axes when rasterising a page.

    Returns:
        One ``PDFPage`` per page, in document order, numbered from 1.
    """
    document = fitz.open(stream=file_bytes, filetype="pdf")
    collected = []
    try:
        for number, pdf_page in enumerate(document, start=1):
            text = pdf_page.get_text("text")
            pixmap = pdf_page.get_pixmap(matrix=fitz.Matrix(dpi_scale, dpi_scale))
            rendered = Image.frombytes(
                "RGB", (pixmap.width, pixmap.height), pixmap.samples
            )
            collected.append(
                PDFPage(
                    page_number=number,
                    embedded_text=text,
                    image=rendered,
                    # Sparseness is judged on the stripped text, but the
                    # unstripped text is what gets stored.
                    is_sparse=len(text.strip()) < SPARSE_TEXT_THRESHOLD,
                )
            )
    finally:
        # Release the document even if extraction or rendering fails.
        document.close()
    return collected
def _group_blocks_into_lines(
blocks: List[Tuple[float, float, float, str]],
) -> List[List[Tuple[float, float, str]]]:
blocks.sort(key=lambda item: (round(item[0], 1), item[1]))
lines: List[List[Tuple[float, float, str]]] = []
current_y: float | None = None
current_line: List[Tuple[float, float, str]] = []
for y0, x0, x1, text in blocks:
if current_y is None or abs(y0 - current_y) > _LINE_Y_TOLERANCE:
if current_line:
lines.append(current_line)
current_line = [(x0, x1, text)]
current_y = y0
else:
current_line.append((x0, x1, text))
if current_line:
lines.append(current_line)
return lines
def extract_page_spatial_text(page: fitz.Page) -> str:
    """Rebuild page text with column spacing from native PDF text blocks.

    Each block of type 0 has its newlines replaced by spaces and is placed on
    a line according to its top coordinate. Horizontal gaps are turned into
    runs of spaces, one space per ``_SPACE_POINTS`` of distance. When no
    usable block exists, the page's sorted plain text is returned instead.
    """
    positioned: List[Tuple[float, float, float, str]] = []
    for entry in page.get_text("blocks"):
        # Index 6 is the block type; only type 0 entries are kept.
        if entry[6] == 0:
            left, top, right, _bottom, raw_text, *_ = entry
            flattened = raw_text.replace("\n", " ").strip()
            if flattened:
                positioned.append((top, left, right, flattened))

    if not positioned:
        return page.get_text("text", sort=True).strip()

    rendered: List[str] = []
    for row in _group_blocks_into_lines(positioned):
        pieces: List[str] = []
        previous_right = 0.0
        for left, right, fragment in sorted(row, key=lambda piece: piece[0]):
            if pieces:
                # Blocks after the first are always separated by >= 1 space.
                pad = max(1, int((left - previous_right) / _SPACE_POINTS))
            else:
                # The first block is indented by its distance from x = 0.
                pad = max(0, int(left / _SPACE_POINTS))
            if pad:
                pieces.append(" " * pad)
            pieces.append(fragment)
            previous_right = right
        rendered.append("".join(pieces).rstrip())
    return "\n".join(rendered).strip()
def extract_pdf_spatial_pages(file_bytes: bytes) -> List[Tuple[int, str, bool]]:
    """Return (page_num, spatial_text, is_sparse) for each PDF page.

    Page numbers start at 1. Sparse pages (stripped embedded text shorter than
    ``SPARSE_TEXT_THRESHOLD``) carry that stripped text as-is; all other pages
    carry the spatially rebuilt text.
    """
    document = fitz.open(stream=file_bytes, filetype="pdf")
    results: List[Tuple[int, str, bool]] = []
    try:
        for number, pdf_page in enumerate(document, start=1):
            stripped = pdf_page.get_text("text").strip()
            sparse = len(stripped) < SPARSE_TEXT_THRESHOLD
            # Spatial reconstruction is skipped for sparse pages.
            text = stripped if sparse else extract_page_spatial_text(pdf_page)
            results.append((number, text, sparse))
    finally:
        document.close()
    return results
def render_page_image(
    file_bytes: bytes, page_num: int, dpi_scale: float = 2.0
) -> Image.Image:
    """Render a single PDF page — used only when chart OCR is needed.

    Args:
        file_bytes: Raw bytes of the PDF document.
        page_num: 1-based number of the page to render.
        dpi_scale: Scale factor applied to both axes when rasterising.

    Returns:
        The rendered page as an image built in "RGB" mode.

    Raises:
        IndexError: If ``page_num`` is outside ``1..page count``.
    """
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        page_count = len(doc)
        # Without this check page_num <= 0 turns into a negative index, which
        # silently selects a page counted from the end of the document.
        if not 1 <= page_num <= page_count:
            raise IndexError(
                f"page_num {page_num} out of range (document has {page_count} pages)"
            )
        page = doc[page_num - 1]
        mat = fitz.Matrix(dpi_scale, dpi_scale)
        pix = page.get_pixmap(matrix=mat)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Close the document on both the success and the error path.
        doc.close()
def render_page_png_base64(file_bytes: bytes, page_num: int = 1, dpi_scale: float = 2.0) -> str:
    """Render one PDF page to PNG and return it as a base64 ASCII string.

    Args:
        file_bytes: Raw bytes of the PDF document.
        page_num: 1-based number of the page to render (first page by default).
        dpi_scale: Scale factor applied to both axes when rasterising.

    Returns:
        The base64 encoding of the page's PNG bytes, decoded as ASCII text.

    Raises:
        IndexError: If ``page_num`` is outside ``1..page count``.
    """
    import base64

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        page_count = len(doc)
        # Without this check page_num <= 0 turns into a negative index, which
        # silently selects a page counted from the end of the document.
        if not 1 <= page_num <= page_count:
            raise IndexError(
                f"page_num {page_num} out of range (document has {page_count} pages)"
            )
        page = doc[page_num - 1]
        mat = fitz.Matrix(dpi_scale, dpi_scale)
        pix = page.get_pixmap(matrix=mat)
        return base64.b64encode(pix.tobytes("png")).decode("ascii")
    finally:
        # Close the document on both the success and the error path.
        doc.close()