Spaces:

build-small-hackathon
/

PaperProf

Running on Zero

File size: 891 Bytes

"""
core/parser.py — PDF text extraction using PyMuPDF (fitz).

Responsibility:
    Open a PDF file and extract its full plain-text content, page by page,
    returning a single concatenated string.  All downstream modules depend
    on this raw text as their starting point.

Public API:
    extract_text(pdf_path: str) -> str
"""

import fitz  # PyMuPDF


def extract_text(pdf_path: str) -> str:
    """Return the concatenated plain text of every page in *pdf_path*.

    Each page's text is extracted in document order and the pages are
    joined with a single newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by ``"\\n"``.

    Raises:
        ValueError: If the document cannot be opened (the underlying error
            is attached as ``__cause__``), or if the extracted text is
            empty or whitespace-only.
    """
    # Keep the try body to the single call that can fail on open, and chain
    # the original exception so the root cause stays visible in tracebacks.
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        raise ValueError(f"Cannot open PDF: {e}") from e

    # Always release the document handle, even if a page fails to extract.
    try:
        pages = [page.get_text() for page in doc]
    finally:
        doc.close()

    text = "\n".join(pages)

    if not text.strip():
        raise ValueError(
            "No text found in this PDF. It may be a scanned document (image-only). "
            "Try a PDF with a text layer."
        )
    return text