# PaperProf/core/parser.py
# Last commit 83ec3f5 (Mehdi): feat: switch to MiniCPM4-8B, add robustness, score tracker, language selector
# Size: 891 bytes
"""
core/parser.py — PDF text extraction using PyMuPDF (fitz).

Responsibility:
    Open a PDF file and extract its full plain-text content, page by page,
    returning a single concatenated string. All downstream modules depend
    on this raw text as their starting point.

Public API:
    extract_text(pdf_path: str) -> str
"""
import fitz # PyMuPDF
def extract_text(pdf_path: str) -> str:
    """Return the concatenated plain text of every page in *pdf_path*.

    The text of each page is obtained with ``page.get_text()`` and the
    per-page strings are joined with a single newline character.

    Args:
        pdf_path: Path of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages as one string.

    Raises:
        ValueError: If ``fitz.open`` fails for any reason, or if the
            extracted text is empty or whitespace-only (for example an
            image-only scan without a text layer).
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # Callers only need to handle ValueError; chain the cause so the
        # underlying error and its traceback are not lost.
        raise ValueError(f"Cannot open PDF: {e}") from e

    # Release the document handle even when extracting a page raises.
    try:
        pages = [page.get_text() for page in doc]
    finally:
        doc.close()

    text = "\n".join(pages)
    if not text.strip():
        raise ValueError(
            "No text found in this PDF. It may be a scanned document (image-only). "
            "Try a PDF with a text layer."
        )
    return text