USF00's picture
Optimize summarization generation and PDF OCR processing
81a53ea
import os
import re
from pathlib import Path
from typing import List, Tuple
import fitz # pymupdf
from pdf2image import convert_from_path
import pytesseract
from PIL import ImageOps, ImageEnhance
# Language spec used as the default `lang` argument of ocr_pdf_page(), which
# forwards it to pytesseract (presumably English + Arabic trained data).
OCR_LANG = "eng+ara"
# Default resolution (DPI) at which ocr_pdf_page() rasterizes a PDF page.
OCR_DPI = 180
# Threshold on the number of non-whitespace characters of native page text.
NATIVE_MIN_CHARS_PER_PAGE = 60 # if native extracted text < this => OCR that page
# Matches the whitespace run that follows a sentence terminator; used by
# split_sentences() as the split point. The terminators are listed on the right.
_SENT_BOUNDARY_RE = re.compile(r"(?<=[\.\!\?\u061F\u06D4\u061B…])\s+") # . ! ? ؟ ۔ ؛ …
def normalize_text(text: str) -> str:
    """Return *text* with unified line endings and collapsed whitespace.

    Steps, in order:
      1. Windows (CRLF) and old-Mac (CR) line endings become LF.
      2. Every run of spaces/tabs becomes a single space.
      3. Three or more consecutive newlines become exactly two.
      4. Leading and trailing whitespace is stripped.
    """
    cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
    # Applied sequentially: horizontal whitespace first, then blank-line runs.
    substitutions = (
        (r"[ \t]+", " "),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def ocr_image_pil(img):
    """Preprocess an image for OCR and return the processed image.

    Despite its name this function does not run OCR itself: it converts the
    image to RGB, reduces it to grayscale and boosts contrast by a factor of
    1.6. The caller passes the result on to the OCR engine.
    """
    grayscale = ImageOps.grayscale(img.convert("RGB"))
    contrast = ImageEnhance.Contrast(grayscale)
    return contrast.enhance(1.6)
def ocr_pdf_page(pdf_path: str, page_number_1based: int, dpi: int = OCR_DPI, lang: str = OCR_LANG) -> str:
    """Rasterize one PDF page and return the text recognized on it.

    Args:
        pdf_path: Path of the PDF file.
        page_number_1based: Page to OCR, counted from 1.
        dpi: Resolution used when rendering the page to an image.
        lang: Language spec handed to pytesseract.

    Returns:
        The recognized text, or "" when the renderer produced no image.
    """
    # Restrict rendering to the single requested page.
    render_options = {
        "dpi": dpi,
        "first_page": page_number_1based,
        "last_page": page_number_1based,
        "fmt": "png",
        "thread_count": 2,
    }
    rendered = convert_from_path(str(pdf_path), **render_options)
    if rendered:
        prepared = ocr_image_pil(rendered[0])
        return pytesseract.image_to_string(prepared, lang=lang)
    return ""
def _non_ws_len(text: str) -> int:
    """Return the number of non-whitespace characters in *text*."""
    return len(re.sub(r"\s+", "", text))


def _native_page_text(doc, index: int) -> str:
    """Return the stripped native (embedded) text of page *index* (0-based)."""
    return (doc.load_page(index).get_text("text") or "").strip()


def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
    """Extract text from a PDF, falling back to OCR for scanned pages.

    The first (up to) 10 pages are sampled: if any of them carries more than
    200 non-whitespace characters of native text, the whole document is
    treated as a native PDF and OCR is never run, so sparse pages such as
    title or blank pages do not trigger a slow OCR pass. Note that this also
    means scanned pages inside such a document are not OCRed.

    Otherwise each page is handled individually: its native text is kept when
    it has at least ``native_min_chars_per_page`` non-whitespace characters,
    and the page is OCRed when it has fewer.

    Args:
        pdf_path: Path of the PDF file.
        native_min_chars_per_page: Minimum non-whitespace character count for
            a page's native text to be accepted without OCR.

    Returns:
        The text of all pages joined by blank lines and passed through
        normalize_text().
    """
    doc = fitz.open(str(pdf_path))
    # try/finally so the document handle is released even if extraction or
    # OCR raises part-way through.
    try:
        page_count = doc.page_count

        # Sampling pass. Keep the extracted text so the main loop does not
        # have to extract the same pages a second time.
        sampled = {}
        is_native_pdf = False
        for i in range(min(10, page_count)):
            sampled[i] = _native_page_text(doc, i)
            if _non_ws_len(sampled[i]) > 200:
                is_native_pdf = True
                break

        parts = []
        for i in range(page_count):
            native = sampled.pop(i, None)
            if native is None:
                native = _native_page_text(doc, i)
            if is_native_pdf or _non_ws_len(native) >= native_min_chars_per_page:
                parts.append(native)
            else:
                # Unknown/scanned document and sparse native text: OCR the page.
                parts.append(ocr_pdf_page(pdf_path, page_number_1based=i + 1))
    finally:
        doc.close()
    return normalize_text("\n\n".join(parts))
def extract_text_from_file(file_path: str) -> str:
    """Return the text content of a ``.txt`` or ``.pdf`` file.

    The file type is decided by the (case-insensitive) suffix. Text files are
    read as UTF-8 with undecodable bytes ignored and then normalized; PDFs
    are delegated to pdf_to_text_smart().

    Raises:
        ValueError: If the suffix is neither ``.txt`` nor ``.pdf``.
    """
    source = Path(file_path)
    suf = source.suffix.lower()
    if suf == ".pdf":
        return pdf_to_text_smart(str(source))
    if suf != ".txt":
        raise ValueError(f"Unsupported file type '{suf}'. Please upload .pdf or .txt only.")
    contents = source.read_text(encoding="utf-8", errors="ignore")
    return normalize_text(contents)
def split_into_chapters(text: str) -> List[Tuple[str, str]]:
    """Split *text* into chapters on a best-effort basis.

    A line counts as a chapter heading when, after stripping, it starts with
    the word "chapter" (any case) followed by whitespace and a number, a
    Roman numeral or a word, e.g. ``CHAPTER 1`` / ``Chapter One``.

    - If fewer than two headings are found, the whole normalized text is
      returned as a single chapter titled ``"BOOK"``.
    - Otherwise each chapter runs from its heading line up to (but not
      including) the next heading; the chapter body includes the heading
      line itself.
    - Any non-empty text that precedes the first heading (preface, prologue,
      title page) is returned as a leading ``"FRONT MATTER"`` entry instead
      of being silently discarded.

    Returns:
        A list of ``(title, body)`` tuples in document order.
    """
    text = normalize_text(text)
    lines = text.splitlines()
    chapter_re = re.compile(r"^\s*(chapter|CHAPTER)\s+([0-9]+|[IVXLC]+|[A-Za-z]+)\b.*$", re.IGNORECASE)

    # (line index, stripped heading text) for every heading-looking line.
    headings = [
        (index, stripped)
        for index, stripped in enumerate(ln.strip() for ln in lines)
        if chapter_re.match(stripped)
    ]
    if len(headings) < 2:
        return [("BOOK", text)]

    chapters = []

    # Keep whatever comes before the first heading so no content is lost.
    first_heading_index = headings[0][0]
    front_matter = "\n".join(lines[:first_heading_index]).strip()
    if front_matter:
        chapters.append(("FRONT MATTER", front_matter))

    # Each chapter ends where the next heading starts; the last one runs to EOF.
    ends = [index for index, _ in headings[1:]] + [len(lines)]
    for (start, title), end in zip(headings, ends):
        chapters.append((title, "\n".join(lines[start:end]).strip()))
    return chapters
def split_sentences(paragraph: str) -> List[str]:
    """Break a paragraph into a list of stripped, non-empty sentences.

    - Blank input yields an empty list.
    - If the paragraph contains none of the recognized sentence terminators
      (Latin ``. ! ?``, their Arabic-script counterparts, or an ellipsis),
      each non-empty line is treated as one sentence.
    - Otherwise the paragraph is split on the whitespace that follows a
      terminator.
    """
    paragraph = paragraph.strip()
    if not paragraph:
        return []

    if set(paragraph).isdisjoint(".!?\u061F\u06D4\u061B…"):
        # No terminator at all: fall back to one sentence per line.
        by_line = list(filter(None, map(str.strip, paragraph.split("\n"))))
        return by_line or [paragraph]

    pieces = _SENT_BOUNDARY_RE.split(paragraph)
    return list(filter(None, map(str.strip, pieces)))
def iter_paragraphs(text: str):
    """Lazily yield the non-empty paragraphs of *text*.

    Paragraphs are separated by one or more blank (or whitespace-only) lines;
    each yielded paragraph is stripped of surrounding whitespace.
    """
    chunks = re.split(r"\n\s*\n+", text)
    # filter(None, ...) drops chunks that are empty after stripping.
    yield from filter(None, map(str.strip, chunks))