""" core/parser.py — PDF text extraction using PyMuPDF (fitz). Responsibility: Open a PDF file and extract its full plain-text content, page by page, returning a single concatenated string. All downstream modules depend on this raw text as their starting point. Public API: extract_text(pdf_path: str) -> str """ import fitz # PyMuPDF def extract_text(pdf_path: str) -> str: """Return the concatenated plain text of every page in *pdf_path*.""" try: doc = fitz.open(pdf_path) except Exception as e: raise ValueError(f"Cannot open PDF: {e}") pages = [page.get_text() for page in doc] doc.close() text = "\n".join(pages) if not text.strip(): raise ValueError( "No text found in this PDF. It may be a scanned document (image-only). " "Try a PDF with a text layer." ) return text