Spaces:
Sleeping
Sleeping
Optimize summarization generation and PDF OCR processing
Browse files
- summarizer.py +1 -1
- utils.py +16 -2
summarizer.py
CHANGED
|
@@ -11,7 +11,7 @@ MODEL_NAME = "facebook/bart-large-cnn"
|
|
| 11 |
BATCH_SIZE = 4
|
| 12 |
NUM_BEAMS = 4
|
| 13 |
NO_REPEAT_NGRAM_SIZE = 3
|
| 14 |
-
EARLY_STOPPING =
|
| 15 |
|
| 16 |
# Chunking config
|
| 17 |
MAX_INPUT_TOKENS = 1024
|
|
|
|
| 11 |
BATCH_SIZE = 4
|
| 12 |
NUM_BEAMS = 4
|
| 13 |
NO_REPEAT_NGRAM_SIZE = 3
|
| 14 |
+
EARLY_STOPPING = True
|
| 15 |
|
| 16 |
# Chunking config
|
| 17 |
MAX_INPUT_TOKENS = 1024
|
utils.py
CHANGED
|
@@ -45,18 +45,32 @@ def ocr_pdf_page(pdf_path: str, page_number_1based: int, dpi: int = OCR_DPI, lan
|
|
| 45 |
return pytesseract.image_to_string(img, lang=lang)
|
| 46 |
|
| 47 |
def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
|
| 48 |
-
"""Extracts text from PDF, falling back to OCR for scanned pages."""
|
|
|
|
| 49 |
doc = fitz.open(str(pdf_path))
|
| 50 |
parts = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
for i in range(doc.page_count):
|
| 53 |
page = doc.load_page(i)
|
| 54 |
native = (page.get_text("text") or "").strip()
|
| 55 |
native_compact_len = len(re.sub(r"\s+", "", native))
|
| 56 |
|
| 57 |
-
if native_compact_len >= native_min_chars_per_page:
|
|
|
|
| 58 |
parts.append(native)
|
| 59 |
else:
|
|
|
|
| 60 |
ocr = ocr_pdf_page(pdf_path, page_number_1based=i+1)
|
| 61 |
parts.append(ocr)
|
| 62 |
|
|
|
|
| 45 |
return pytesseract.image_to_string(img, lang=lang)
|
| 46 |
|
| 47 |
def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
|
| 48 |
+
"""Extracts text from PDF, falling back to OCR for scanned pages.
|
| 49 |
+
Optimized to avoid OCR on native PDFs with sparse pages (like title pages)."""
|
| 50 |
doc = fitz.open(str(pdf_path))
|
| 51 |
parts = []
|
| 52 |
+
|
| 53 |
+
# Quick check: is this likely a native PDF?
|
| 54 |
+
# Sample up to 10 pages to see if any has a good amount of native text.
|
| 55 |
+
is_native_pdf = False
|
| 56 |
+
sample_pages = min(10, doc.page_count)
|
| 57 |
+
for i in range(sample_pages):
|
| 58 |
+
page = doc.load_page(i)
|
| 59 |
+
native = (page.get_text("text") or "").strip()
|
| 60 |
+
if len(re.sub(r"\s+", "", native)) > 200:
|
| 61 |
+
is_native_pdf = True
|
| 62 |
+
break
|
| 63 |
|
| 64 |
for i in range(doc.page_count):
|
| 65 |
page = doc.load_page(i)
|
| 66 |
native = (page.get_text("text") or "").strip()
|
| 67 |
native_compact_len = len(re.sub(r"\s+", "", native))
|
| 68 |
|
| 69 |
+
if native_compact_len >= native_min_chars_per_page or is_native_pdf:
|
| 70 |
+
# If we know it's a native PDF, even sparse pages (like titles/blank pages) don't need OCR
|
| 71 |
parts.append(native)
|
| 72 |
else:
|
| 73 |
+
# Only OCR if it's not a known native PDF and native text is sparse (could be a scanned page)
|
| 74 |
ocr = ocr_pdf_page(pdf_path, page_number_1based=i+1)
|
| 75 |
parts.append(ocr)
|
| 76 |
|