Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| import fitz # pymupdf | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| from PIL import ImageOps, ImageEnhance | |
# Tesseract language pack selector passed as ``lang`` to pytesseract.
OCR_LANG: str = "eng+ara"

# Rasterization resolution (dots per inch) used when rendering a PDF page for OCR.
OCR_DPI: int = 180

# If the native (embedded) text of a page has fewer non-whitespace characters
# than this, the page is treated as a candidate for OCR.
NATIVE_MIN_CHARS_PER_PAGE: int = 60

# Sentence boundary: whitespace preceded by one of  . ! ? ؟ ۔ ؛ …
# The lookbehind keeps the terminator attached to the sentence it ends.
_SENT_BOUNDARY_RE = re.compile(
    r"(?<=[\.\!\?\u061F\u06D4\u061B…])\s+"
)
def normalize_text(text: str) -> str:
    """Return *text* with line endings unified and whitespace runs collapsed.

    Steps, in order:
      1. ``\\r\\n`` and lone ``\\r`` become ``\\n``.
      2. Runs of spaces/tabs collapse to a single space.
      3. Three or more consecutive newlines collapse to exactly two.
      4. Leading and trailing whitespace is stripped.
    """
    cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
    # Order matters: horizontal whitespace first, then blank-line runs.
    substitutions = (
        (r"[ \t]+", " "),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def ocr_image_pil(img):
    """Prepare an image for OCR; this does not run OCR itself.

    The image is converted to RGB, reduced to grayscale, and its contrast
    is enhanced by a factor of 1.6. The preprocessed image is returned.
    """
    grayscale = ImageOps.grayscale(img.convert("RGB"))
    return ImageEnhance.Contrast(grayscale).enhance(1.6)
def ocr_pdf_page(pdf_path: str, page_number_1based: int, dpi: int = OCR_DPI, lang: str = OCR_LANG) -> str:
    """Rasterize one page of a PDF and return the text recognized on it.

    Args:
        pdf_path: Path of the PDF file.
        page_number_1based: Page to OCR, counting from 1.
        dpi: Resolution used when rendering the page to an image.
        lang: Language string handed to ``pytesseract.image_to_string``.

    Returns:
        The recognized text, or ``""`` when the renderer produced no image.
    """
    # first_page == last_page restricts rendering to the single requested page.
    rendered = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        first_page=page_number_1based,
        last_page=page_number_1based,
        fmt="png",
        thread_count=2,
    )
    if rendered:
        prepared = ocr_image_pil(rendered[0])
        return pytesseract.image_to_string(prepared, lang=lang)
    return ""
def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
    """Extract text from a PDF, falling back to OCR for scanned pages.

    The native (embedded) text of every page is read first. If any of the
    first 10 pages holds more than 200 non-whitespace characters, the whole
    document is considered a native PDF and no page is OCRed, so sparse
    title or blank pages do not trigger OCR. Otherwise each page whose
    native text has fewer than ``native_min_chars_per_page`` non-whitespace
    characters is OCRed via ``ocr_pdf_page``.

    Args:
        pdf_path: Path of the PDF file.
        native_min_chars_per_page: Per-page threshold of non-whitespace
            characters below which a page is OCRed (non-native PDFs only).

    Returns:
        The page texts joined by blank lines and passed through
        ``normalize_text``.
    """
    doc = fitz.open(str(pdf_path))
    try:
        # Read each page's native text exactly once.
        native_pages = [
            (doc.load_page(i).get_text("text") or "").strip()
            for i in range(doc.page_count)
        ]
    finally:
        # Always release the document, even if extraction raises. The OCR
        # fallback below re-opens the file by path, so it does not need it.
        doc.close()

    compact_lengths = [len(re.sub(r"\s+", "", native)) for native in native_pages]

    # Quick check: does any of the first (up to) 10 pages carry real text?
    is_native_pdf = any(length > 200 for length in compact_lengths[:10])

    parts = []
    for index, (native, compact_len) in enumerate(zip(native_pages, compact_lengths)):
        if is_native_pdf or compact_len >= native_min_chars_per_page:
            # Known-native PDF, or the page has enough embedded text.
            parts.append(native)
        else:
            # Sparse page in a PDF with no evidence of native text:
            # it may be a scan, so OCR it.
            parts.append(ocr_pdf_page(pdf_path, page_number_1based=index + 1))

    return normalize_text("\n\n".join(parts))
def extract_text_from_file(file_path: str) -> str:
    """Return the text content of a ``.txt`` or ``.pdf`` file.

    The extension is matched case-insensitively. Text files are read as
    UTF-8 with undecodable bytes ignored and then normalized; PDFs are
    delegated to ``pdf_to_text_smart``.

    Raises:
        ValueError: If the file extension is neither ``.txt`` nor ``.pdf``.
    """
    path = Path(file_path)
    suf = path.suffix.lower()

    if suf == ".pdf":
        return pdf_to_text_smart(str(path))

    if suf != ".txt":
        raise ValueError(f"Unsupported file type '{suf}'. Please upload .pdf or .txt only.")

    contents = path.read_text(encoding="utf-8", errors="ignore")
    return normalize_text(contents)
def split_into_chapters(text: str) -> List[Tuple[str, str]]:
    """Split *text* into chapters on a best-effort basis.

    A heading is any line that, once stripped, starts with the word
    "chapter" (case-insensitive) followed by a number, a Roman numeral or
    a word, e.g. ``CHAPTER 1``, ``Chapter One``, ``chapter IV``.

    - Fewer than two headings: the whole normalized text is returned as a
      single chapter titled ``"BOOK"``.
    - Otherwise each chapter runs from its heading line up to (but not
      including) the next heading; the body includes the heading line.
      Any non-blank text before the first heading is kept as a leading
      ``"FRONT MATTER"`` entry so it is not discarded.

    Returns:
        A list of ``(title, body)`` tuples in document order.
    """
    text = normalize_text(text)
    lines = text.splitlines()
    # IGNORECASE already covers every casing of "chapter".
    chapter_re = re.compile(
        r"^\s*(?:chapter)\s+([0-9]+|[IVXLC]+|[A-Za-z]+)\b.*$", re.IGNORECASE
    )

    # (line index, stripped heading text) for every heading-looking line.
    headings = []
    for index, line in enumerate(lines):
        stripped = line.strip()
        if chapter_re.match(stripped):
            headings.append((index, stripped))

    if len(headings) < 2:
        return [("BOOK", text)]

    chapters = []

    # Keep any preface/title text that precedes the first heading.
    front_matter = "\n".join(lines[:headings[0][0]]).strip()
    if front_matter:
        chapters.append(("FRONT MATTER", front_matter))

    # Each chapter ends where the next heading starts; the last ends at EOF.
    ends = [index for index, _ in headings[1:]] + [len(lines)]
    for (start, title), end in zip(headings, ends):
        chapters.append((title, "\n".join(lines[start:end]).strip()))
    return chapters
def split_sentences(paragraph: str) -> List[str]:
    """Break a paragraph into a list of non-empty, stripped sentences.

    If the paragraph contains any sentence terminator (``. ! ?`` or the
    Arabic/Urdu marks ``؟ ۔ ؛`` or ``…``), it is split on the whitespace
    following a terminator. Without any terminator, each non-blank line is
    treated as a sentence. Blank input yields an empty list.
    """
    body = paragraph.strip()
    if not body:
        return []

    if any(mark in body for mark in ".!?\u061F\u06D4\u061B…"):
        pieces = (piece.strip() for piece in _SENT_BOUNDARY_RE.split(body))
        return [piece for piece in pieces if piece]

    # No terminator anywhere: fall back to one sentence per non-blank line.
    non_blank_lines = [row.strip() for row in body.split("\n") if row.strip()]
    return non_blank_lines or [body]
def iter_paragraphs(text: str):
    """Yield each non-empty paragraph of *text*, stripped of outer whitespace.

    Paragraphs are separated by one or more blank lines, i.e. a newline
    followed by optional whitespace and at least one more newline.
    """
    chunks = re.split(r"\n\s*\n+", text)
    yield from (chunk for chunk in map(str.strip, chunks) if chunk)