Spaces:
Running on Zero
Running on Zero
| """ | |
| core/parser.py — PDF text extraction using PyMuPDF (fitz). | |
| Responsibility: | |
| Open a PDF file and extract its full plain-text content, page by page, | |
| returning a single concatenated string. All downstream modules depend | |
| on this raw text as their starting point. | |
| Public API: | |
| extract_text(pdf_path: str) -> str | |
| """ | |
| import fitz # PyMuPDF | |
def extract_text(pdf_path: str) -> str:
    """Return the concatenated plain text of every page in *pdf_path*.

    Pages are extracted in document order and joined with a single
    newline between consecutive pages.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages as one string.

    Raises:
        ValueError: If the file cannot be opened, or if the extracted text
            is empty or whitespace-only (for example an image-only scan
            with no text layer).
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # Chain the cause so the original PyMuPDF error stays in the traceback.
        raise ValueError(f"Cannot open PDF: {e}") from e

    # Close the document even when extraction fails part-way through a page;
    # otherwise the underlying file handle would leak.
    try:
        pages = [page.get_text() for page in doc]
    finally:
        doc.close()

    text = "\n".join(pages)
    if not text.strip():
        raise ValueError(
            "No text found in this PDF. It may be a scanned document (image-only). "
            "Try a PDF with a text layer."
        )
    return text