File size: 891 Bytes
e1c0b77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83ec3f5
 
 
 
 
e1c0b77
 
83ec3f5
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
"""
core/parser.py — PDF text extraction using PyMuPDF (fitz).

Responsibility:
    Open a PDF file and extract its full plain-text content, page by page,
    returning a single concatenated string.  All downstream modules depend
    on this raw text as their starting point.

Public API:
    extract_text(pdf_path: str) -> str
"""

import fitz  # PyMuPDF


def extract_text(pdf_path: str) -> str:
    """Return the concatenated plain text of every page in *pdf_path*.

    The text of each page is obtained with ``page.get_text()`` and the
    per-page strings are joined with a single newline.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages, joined with ``"\\n"``.

    Raises:
        ValueError: If the document cannot be opened, or if the extracted
            text is empty or whitespace-only.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # Any failure to open is reported to callers as ValueError; chain the
        # original exception so the underlying cause stays in the traceback.
        raise ValueError(f"Cannot open PDF: {e}") from e

    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Close the document even when extraction of a page fails part-way.
        doc.close()

    text = "\n".join(pages)

    # Whitespace-only output is treated the same as no text at all.
    if not text.strip():
        raise ValueError(
            "No text found in this PDF. It may be a scanned document (image-only). "
            "Try a PDF with a text layer."
        )
    return text