import os import re from pathlib import Path from typing import List, Tuple import fitz # pymupdf from pdf2image import convert_from_path import pytesseract from PIL import ImageOps, ImageEnhance OCR_LANG = "eng+ara" OCR_DPI = 180 NATIVE_MIN_CHARS_PER_PAGE = 60 # if native extracted text < this => OCR that page _SENT_BOUNDARY_RE = re.compile(r"(?<=[\.\!\?\u061F\u06D4\u061B…])\s+") # . ! ? ؟ ۔ ؛ … def normalize_text(text: str) -> str: """Normalizes text by removing excessive whitespace and fixing newlines.""" text = text.replace("\r\n", "\n").replace("\r", "\n") text = re.sub(r"[ \t]+", " ", text) text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def ocr_image_pil(img): """Applies light preprocessing to improve OCR accuracy.""" img = img.convert("RGB") img = ImageOps.grayscale(img) img = ImageEnhance.Contrast(img).enhance(1.6) return img def ocr_pdf_page(pdf_path: str, page_number_1based: int, dpi: int = OCR_DPI, lang: str = OCR_LANG) -> str: """OCRs a single PDF page.""" images = convert_from_path( str(pdf_path), dpi=dpi, first_page=page_number_1based, last_page=page_number_1based, fmt="png", thread_count=2, ) if not images: return "" img = images[0] img = ocr_image_pil(img) return pytesseract.image_to_string(img, lang=lang) def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str: """Extracts text from PDF, falling back to OCR for scanned pages. Optimized to avoid OCR on native PDFs with sparse pages (like title pages).""" doc = fitz.open(str(pdf_path)) parts = [] # Quick check: is this likely a native PDF? # Sample up to 10 pages to see if any has a good amount of native text. is_native_pdf = False sample_pages = min(10, doc.page_count) for i in range(sample_pages): page = doc.load_page(i) native = (page.get_text("text") or "").strip() if len(re.sub(r"\s+", "", native)) > 200: is_native_pdf = True break for i in range(doc.page_count): page = doc.load_page(i) native = (page.get_text("text") or "").strip() native_compact_len = len(re.sub(r"\s+", "", native)) if native_compact_len >= native_min_chars_per_page or is_native_pdf: # If we know it's a native PDF, even sparse pages (like titles/blank pages) don't need OCR parts.append(native) else: # Only OCR if it's not a known native PDF and native text is sparse (could be a scanned page) ocr = ocr_pdf_page(pdf_path, page_number_1based=i+1) parts.append(ocr) doc.close() return normalize_text("\n\n".join(parts)) def extract_text_from_file(file_path: str) -> str: """Extracts text from a .txt or .pdf file.""" path = Path(file_path) suf = path.suffix.lower() if suf == ".txt": raw = path.read_text(encoding="utf-8", errors="ignore") return normalize_text(raw) if suf == ".pdf": return pdf_to_text_smart(str(path)) raise ValueError(f"Unsupported file type '{suf}'. Please upload .pdf or .txt only.") def split_into_chapters(text: str) -> List[Tuple[str, str]]: """ Best effort chapter split: - Detect lines that look like: CHAPTER 1 / Chapter One / CHAPTER ONE etc. - If not found, return one chapter = full text. Returns: list of (title, body) """ text = normalize_text(text) lines = text.splitlines() chapter_re = re.compile(r"^\s*(chapter|CHAPTER)\s+([0-9]+|[IVXLC]+|[A-Za-z]+)\b.*$", re.IGNORECASE) idxs = [] titles = [] for i, ln in enumerate(lines): if chapter_re.match(ln.strip()): idxs.append(i) titles.append(ln.strip()) if len(idxs) < 2: return [("BOOK", text)] chapters = [] for k in range(len(idxs)): start = idxs[k] end = idxs[k+1] if k+1 < len(idxs) else len(lines) title = titles[k] body = "\n".join(lines[start:end]).strip() chapters.append((title, body)) return chapters def split_sentences(paragraph: str) -> List[str]: """Splits a paragraph into sentences.""" paragraph = paragraph.strip() if not paragraph: return [] if not any(ch in paragraph for ch in ".!?\u061F\u06D4\u061B…"): ls = [ln.strip() for ln in paragraph.split("\n") if ln.strip()] return ls if ls else [paragraph] return [s.strip() for s in _SENT_BOUNDARY_RE.split(paragraph) if s.strip()] def iter_paragraphs(text: str): """Yields paragraphs from text.""" for p in re.split(r"\n\s*\n+", text): p = p.strip() if p: yield p