USF00 committed on
Commit
81a53ea
·
1 Parent(s): d7630f2

Optimize summarization generation and PDF OCR processing

Browse files
Files changed (2) hide show
  1. summarizer.py +1 -1
  2. utils.py +16 -2
summarizer.py CHANGED
@@ -11,7 +11,7 @@ MODEL_NAME = "facebook/bart-large-cnn"
11
  BATCH_SIZE = 4
12
  NUM_BEAMS = 4
13
  NO_REPEAT_NGRAM_SIZE = 3
14
- EARLY_STOPPING = False
15
 
16
  # Chunking config
17
  MAX_INPUT_TOKENS = 1024
 
11
  BATCH_SIZE = 4
12
  NUM_BEAMS = 4
13
  NO_REPEAT_NGRAM_SIZE = 3
14
+ EARLY_STOPPING = True
15
 
16
  # Chunking config
17
  MAX_INPUT_TOKENS = 1024
utils.py CHANGED
@@ -45,18 +45,32 @@ def ocr_pdf_page(pdf_path: str, page_number_1based: int, dpi: int = OCR_DPI, lan
45
  return pytesseract.image_to_string(img, lang=lang)
46
 
47
  def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
48
- """Extracts text from PDF, falling back to OCR for scanned pages."""
 
49
  doc = fitz.open(str(pdf_path))
50
  parts = []
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  for i in range(doc.page_count):
53
  page = doc.load_page(i)
54
  native = (page.get_text("text") or "").strip()
55
  native_compact_len = len(re.sub(r"\s+", "", native))
56
 
57
- if native_compact_len >= native_min_chars_per_page:
 
58
  parts.append(native)
59
  else:
 
60
  ocr = ocr_pdf_page(pdf_path, page_number_1based=i+1)
61
  parts.append(ocr)
62
 
 
45
  return pytesseract.image_to_string(img, lang=lang)
46
 
47
  def pdf_to_text_smart(pdf_path: str, native_min_chars_per_page: int = NATIVE_MIN_CHARS_PER_PAGE) -> str:
48
+ """Extracts text from PDF, falling back to OCR for scanned pages.
49
+ Optimized to avoid OCR on native PDFs with sparse pages (like title pages)."""
50
  doc = fitz.open(str(pdf_path))
51
  parts = []
52
+
53
+ # Quick check: is this likely a native PDF?
54
+ # Sample up to 10 pages to see if any has a good amount of native text.
55
+ is_native_pdf = False
56
+ sample_pages = min(10, doc.page_count)
57
+ for i in range(sample_pages):
58
+ page = doc.load_page(i)
59
+ native = (page.get_text("text") or "").strip()
60
+ if len(re.sub(r"\s+", "", native)) > 200:
61
+ is_native_pdf = True
62
+ break
63
 
64
  for i in range(doc.page_count):
65
  page = doc.load_page(i)
66
  native = (page.get_text("text") or "").strip()
67
  native_compact_len = len(re.sub(r"\s+", "", native))
68
 
69
+ if native_compact_len >= native_min_chars_per_page or is_native_pdf:
70
+ # If we know it's a native PDF, even sparse pages (like titles/blank pages) don't need OCR
71
  parts.append(native)
72
  else:
73
+ # Only OCR if it's not a known native PDF and native text is sparse (could be a scanned page)
74
  ocr = ocr_pdf_page(pdf_path, page_number_1based=i+1)
75
  parts.append(ocr)
76