# Spaces: USF00 / Summarization_Deploy (status: Sleeping)
# File size: 4,793 Bytes

import os
import re
from pathlib import Path
from typing import List, Tuple

import fitz  # pymupdf
from pdf2image import convert_from_path
import pytesseract
from PIL import ImageOps, ImageEnhance

# Default language spec handed to pytesseract (English plus Arabic).
OCR_LANG = "eng+ara"
# Default resolution, in DPI, used when a PDF page is rendered to an image for OCR.
OCR_DPI = 180
NATIVE_MIN_CHARS_PER_PAGE = 60  # if native extracted text < this => OCR that page

# Sentence boundary: a run of whitespace that directly follows a sentence
# terminator. The lookbehind keeps the terminator attached to the sentence
# that precedes it when the pattern is used with split().
_SENT_BOUNDARY_RE = re.compile(r"(?<=[\.\!\?\u061F\u06D4\u061B…])\s+")  # . ! ? ؟ ۔ ؛ …

def normalize_text(text: str) -> str:
    """Return *text* with unified newlines and collapsed whitespace.

    Steps, in order:
      1. Convert Windows (CRLF) and old-Mac (CR) line endings to LF.
      2. Collapse every run of spaces/tabs into a single space.
      3. Collapse three or more consecutive newlines into exactly two.
      4. Strip leading and trailing whitespace from the whole string.
    """
    cleaned = text.replace("\r\n", "\n").replace("\r", "\n")
    # Order matters: horizontal whitespace first, then blank-line runs.
    substitutions = (
        (r"[ \t]+", " "),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def ocr_image_pil(img):
    """Return a preprocessed copy of *img* intended to be fed to OCR.

    The image is converted to RGB, reduced to grayscale, and its contrast
    is boosted by a factor of 1.6. No OCR is performed here; the caller
    passes the returned image to the OCR engine.
    """
    grayscale = ImageOps.grayscale(img.convert("RGB"))
    return ImageEnhance.Contrast(grayscale).enhance(1.6)

def ocr_pdf_page(pdf_path: str, page_number_1based: int, dpi: int = OCR_DPI, lang: str = OCR_LANG) -> str:
    """Render one page of a PDF to an image and return its OCR text.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number_1based: Page to OCR, counted from 1.
        dpi: Resolution used when rendering the page.
        lang: Language spec passed to pytesseract.

    Returns:
        The recognized text, or an empty string when the renderer
        produced no image for the requested page.
    """
    # first_page == last_page restricts rendering to the single page wanted.
    rendered = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        first_page=page_number_1based,
        last_page=page_number_1based,
        fmt="png",
        thread_count=2,
    )
    if rendered:
        prepared = ocr_image_pil(rendered[0])
        return pytesseract.image_to_string(prepared, lang=lang)
    return ""

def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
    """Extract text from a PDF, falling back to OCR for scanned pages.

    The embedded (native) text of every page is read first. If any of the
    first 10 pages carries more than 200 non-whitespace characters, the
    file is treated as a native PDF and no page is OCR'd, so sparse pages
    such as title or blank pages do not trigger OCR. Otherwise, each page
    whose native text has fewer than ``native_min_chars_per_page``
    non-whitespace characters is OCR'd instead.

    Args:
        pdf_path: Path of the PDF file.
        native_min_chars_per_page: Minimum non-whitespace character count
            for a page's native text to be accepted without OCR.

    Returns:
        The page texts joined by blank lines and passed through
        ``normalize_text``.
    """
    doc = fitz.open(str(pdf_path))
    try:
        # Read each page's native text exactly once; the same values serve
        # both the "is this a native PDF?" probe and the final output.
        native_pages = [
            (doc.load_page(i).get_text("text") or "").strip()
            for i in range(doc.page_count)
        ]
    finally:
        # Always release the document, even if extraction raised.
        doc.close()

    compact_lens = [len(re.sub(r"\s+", "", native)) for native in native_pages]

    # Quick check on up to the first 10 pages: one text-rich page is enough
    # to classify the whole file as native.
    is_native_pdf = any(n > 200 for n in compact_lens[:10])

    parts = []
    for page_no, (native, compact_len) in enumerate(zip(native_pages, compact_lens), start=1):
        if is_native_pdf or compact_len >= native_min_chars_per_page:
            parts.append(native)
        else:
            # Not a known native PDF and the page is sparse: likely a scan.
            parts.append(ocr_pdf_page(pdf_path, page_number_1based=page_no))

    return normalize_text("\n\n".join(parts))

def extract_text_from_file(file_path: str) -> str:
    """Extract normalized text from a ``.txt`` or ``.pdf`` file.

    The file type is decided by the (case-insensitive) filename suffix.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text: normalized file contents for ``.txt``, or the
        result of ``pdf_to_text_smart`` for ``.pdf``.

    Raises:
        ValueError: If the suffix is neither ``.txt`` nor ``.pdf``.
    """
    path = Path(file_path)
    suf = path.suffix.lower()

    if suf == ".txt":
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
        # mark. With plain "utf-8" the BOM stays in the text as U+FEFF, which
        # str.strip() does not remove and which stops a heading on the first
        # line from being recognized. Undecodable bytes are still skipped.
        raw = path.read_text(encoding="utf-8-sig", errors="ignore")
        return normalize_text(raw)

    if suf == ".pdf":
        return pdf_to_text_smart(str(path))

    raise ValueError(f"Unsupported file type '{suf}'. Please upload .pdf or .txt only.")

def split_into_chapters(text: str) -> List[Tuple[str, str]]:
    """
    Best-effort split of a book into chapters.

    A heading is any line that starts with the word "chapter" (any case)
    followed by a number, a Roman numeral, or a word, e.g.
    "CHAPTER 1", "Chapter One", "chapter IV".

    - With fewer than two headings, the whole normalized text is returned
      as a single chapter titled "BOOK".
    - Otherwise each chapter runs from its heading line up to (but not
      including) the next heading; the body includes the heading line.
      Text that precedes the first heading is not part of any chapter.

    Returns: list of (title, body)
    """
    text = normalize_text(text)
    lines = text.splitlines()

    heading_re = re.compile(r"^\s*(chapter|CHAPTER)\s+([0-9]+|[IVXLC]+|[A-Za-z]+)\b.*$", re.IGNORECASE)

    # (line index, stripped heading text) for every heading line found.
    headings = [
        (pos, raw.strip())
        for pos, raw in enumerate(lines)
        if heading_re.match(raw.strip())
    ]

    if len(headings) < 2:
        return [("BOOK", text)]

    # Each chapter ends where the next one starts; the last ends at EOF.
    stops = [pos for pos, _ in headings[1:]] + [len(lines)]
    return [
        (title, "\n".join(lines[begin:stop]).strip())
        for (begin, title), stop in zip(headings, stops)
    ]

def split_sentences(paragraph: str) -> List[str]:
    """Break a paragraph into a list of trimmed, non-empty sentences.

    A blank paragraph yields an empty list. If the paragraph contains no
    sentence terminator at all, its non-empty lines are returned instead.
    Otherwise it is split on whitespace that follows a terminator.
    """
    body = paragraph.strip()
    if not body:
        return []

    if set(".!?\u061F\u06D4\u061B…").isdisjoint(body):
        # No terminator anywhere: treat each non-blank line as a sentence.
        trimmed_lines = (piece.strip() for piece in body.split("\n"))
        kept = [piece for piece in trimmed_lines if piece]
        return kept or [body]

    sentences = []
    for chunk in _SENT_BOUNDARY_RE.split(body):
        chunk = chunk.strip()
        if chunk:
            sentences.append(chunk)
    return sentences

def iter_paragraphs(text: str):
    """Yield each non-empty paragraph of *text*, trimmed of whitespace.

    Paragraphs are separated by one or more blank lines (lines that are
    empty or contain only whitespace).
    """
    trimmed_blocks = map(str.strip, re.split(r"\n\s*\n+", text))
    yield from (block for block in trimmed_blocks if block)