Spaces:

triospacehub
/

ocr

Sleeping

App Files Files Community

triospacehub committed on Apr 18

Commit

c8cbfa0

verified ·

1 Parent(s): 4bcf2c9

Update doc_reader.py

Browse files

Files changed (1) hide show

doc_reader.py +108 -85

doc_reader.py CHANGED Viewed

@@ -1,15 +1,17 @@
 """
 doc_reader.py
 -------------
-Extracts full text from .docx and .pdf files.
-For scanned PDFs: converts each page to an image and uses GPT-4o vision
-to extract all text. Falls back to pdfplumber for text-based PDFs.
 For DOCX: recursive XML walk to catch nested tables.
 """
 import os
 import base64
 import pdfplumber
 from docx import Document
 from docx.oxml.ns import qn
@@ -17,120 +19,110 @@ from pathlib import Path
 from openai import OpenAI
-# ─── PDF: detect if scanned ───────────────────────────────────────────────────
 def _is_scanned_pdf(file_path: str, sample_pages: int = 3) -> bool:
-    """Return True if PDF has little/no extractable text (i.e. scanned)."""
     try:
         with pdfplumber.open(file_path) as pdf:
             pages_to_check = min(sample_pages, len(pdf.pages))
-            total_chars = 0
-            for i in range(pages_to_check):
-                text = pdf.pages[i].extract_text() or ""
-                total_chars += len(text.strip())
             avg = total_chars / max(pages_to_check, 1)
-            print(f"   Avg chars/page (first {pages_to_check} pages): {avg:.0f}")
-            return avg < 100  # scanned if less than 100 chars per page
     except Exception:
         return True
-# ─── PDF: vision extraction via GPT-4o ───────────────────────────────────────
 def _pdf_page_to_base64(file_path: str, page_num: int) -> str:
-    """Convert a single PDF page to base64 PNG using pdf2image."""
     from pdf2image import convert_from_path
-    images = convert_from_path(
-        file_path,
-        first_page=page_num + 1,
-        last_page=page_num + 1,
-        dpi=200
-    )
     if not images:
         return ""
-    import io
     buf = io.BytesIO()
     images[0].save(buf, format="PNG")
     return base64.b64encode(buf.getvalue()).decode("utf-8")
 def _extract_text_from_scanned_pdf(file_path: str) -> str:
-    """Use GPT-4o vision to extract text from each page of a scanned PDF."""
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
-        raise ValueError("OPENAI_API_KEY not set — required for scanned PDF extraction.")
     client = OpenAI(api_key=api_key)
-    # Get page count
     with pdfplumber.open(file_path) as pdf:
         num_pages = len(pdf.pages)
-    print(f"   Scanned PDF detected — {num_pages} pages, using GPT-4o vision...")
     all_text = []
     for page_num in range(num_pages):
-        print(f"   Processing page {page_num + 1}/{num_pages}...")
         try:
             b64 = _pdf_page_to_base64(file_path, page_num)
             if not b64:
                 continue
             response = client.chat.completions.create(
                 model="gpt-4o",
-                max_tokens=4096,
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/png;base64,{b64}",
-                                    "detail": "high"
-                                }
-                            },
-                            {
-                                "type": "text",
-                                "text": (
-                                    "This is a page from an Indian HFC/NBFC loan document (CAL/CAM/COE). "
-                                    "Extract ALL text from this page exactly as it appears. "
-                                    "Preserve table structure using | separators for columns. "
-                                    "Preserve all numbers, dates, percentages, names, addresses. "
-                                    "Do NOT summarize. Output raw extracted text only."
-                                )
-                            }
-                        ]
-                    }
-                ]
             )
-            page_text = response.choices[0].message.content
-            all_text.append(f"\n--- Page {page_num + 1} ---\n{page_text}")
         except Exception as e:
             print(f"   Warning: page {page_num + 1} failed: {e}")
-            all_text.append(f"\n--- Page {page_num + 1} --- [extraction failed: {e}]")
     return "\n".join(all_text).strip()
-# ─── PDF: text-based extraction ───────────────────────────────────────────────
 def extract_text_from_pdf(file_path: str) -> str:
-    """Extract text from PDF — vision for scanned, pdfplumber for text-based."""
     if _is_scanned_pdf(file_path):
         return _extract_text_from_scanned_pdf(file_path)
-    # Text-based PDF — use pdfplumber
-    print("   Text-based PDF detected — using pdfplumber...")
     text_parts = []
     with pdfplumber.open(file_path) as pdf:
         for i, page in enumerate(pdf.pages):
-            page_text = page.extract_text()
             if page_text:
-                text_parts.append(f"\n--- Page {i + 1} ---\n{page_text}")
             tables = page.extract_tables()
             for table in tables:
                 for row in table:
@@ -141,13 +133,9 @@ def extract_text_from_pdf(file_path: str) -> str:
     return "\n".join(text_parts).strip()
-# ─── DOCX helpers ─────────────────────────────────────────────────────────────
 def _extract_cell_text(tc_element, depth: int = 0) -> str:
-    """
-    Recursively walk a <w:tc> XML element and return all text, including
-    text inside nested <w:tbl> elements (tables-within-cells).
-    """
     parts = []
     for child in tc_element:
         tag = child.tag.split("}")[1] if "}" in child.tag else child.tag
@@ -163,12 +151,10 @@ def _extract_cell_text(tc_element, depth: int = 0) -> str:
                 for tc in tr.findall(qn("w:tc")):
                     cell_text = _extract_cell_text(tc, depth + 1)
                     row_cells.append(cell_text)
                 deduped = []
                 for val in row_cells:
                     if not deduped or val != deduped[-1]:
                         deduped.append(val)
                 row_str = " | ".join(deduped)
                 if row_str.strip(" |"):
                     parts.append(row_str)
@@ -176,6 +162,42 @@ def _extract_cell_text(tc_element, depth: int = 0) -> str:
     return "\n".join(parts)
 def extract_text_from_docx(file_path: str) -> str:
     doc = Document(file_path)
     chunks = []
@@ -185,26 +207,24 @@ def extract_text_from_docx(file_path: str) -> str:
             chunks.append(para.text.strip())
     for t_idx, table in enumerate(doc.tables):
-        chunks.append(f"\n--- Table {t_idx + 1} ---")
         for row in table.rows:
             row_cells = []
             for cell in row.cells:
                 cell_text = _extract_cell_text(cell._tc)
                 row_cells.append(cell_text)
             deduped = []
             for val in row_cells:
                 if not deduped or val != deduped[-1]:
                     deduped.append(val)
             row_str = " | ".join(deduped)
             if row_str.strip(" |"):
                 chunks.append(row_str)
-    return "\n".join(chunks).strip()
-# ─── Public API ───────────────────────────────────────────────────────────────
 def extract_text(file_path: str) -> str:
     ext = Path(file_path).suffix.lower()
@@ -217,16 +237,17 @@ def extract_text(file_path: str) -> str:
         print("   Format: DOCX")
         return extract_text_from_docx(file_path)
     elif ext == ".doc":
-        raise ValueError(
-            ".doc (old Word format) is not supported. "
-            "Please save as .docx and re-upload."
-        )
-    else:
-        raise ValueError(f"Unsupported file format: {ext}. Supported: .pdf, .docx")
-# ─── Quick test ───────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     import sys
@@ -234,8 +255,10 @@ if __name__ == "__main__":
         path = sys.argv[1]
         print(f"[TEST] Reading: {path}")
         text = extract_text(path)
-        print(f"[TEST] Extracted {len(text):,} characters")
-        print("\n--- First 3000 chars ---")
-        print(text[:3000])
     else:
-        print("Usage: python doc_reader.py yourfile.pdf/docx")

 """
 doc_reader.py
 -------------
+Extracts full text from .docx, .pdf, and .txt files.
+For scanned PDFs: converts each page to an image and uses GPT-4o vision.
+Falls back to pdfplumber for text-based PDFs.
 For DOCX: recursive XML walk to catch nested tables.
+Outputs clear section markers so doc_sectioner can locate annexures.
 """
 import os
 import base64
+import io
+import re
 import pdfplumber
 from docx import Document
 from docx.oxml.ns import qn
 from openai import OpenAI
+# ── PDF: detect if scanned ────────────────────────────────────────────────────
 def _is_scanned_pdf(file_path: str, sample_pages: int = 3, min_chars_per_page: int = 100) -> bool:
     """Guess whether a PDF is a scan by sampling its extractable text.

     Args:
         file_path: Path to the PDF file.
         sample_pages: Maximum number of leading pages to sample.
         min_chars_per_page: Average stripped-character count per sampled page
             below which the PDF is treated as scanned.

     Returns:
         True when the sampled pages average fewer than ``min_chars_per_page``
         characters of extractable text, or when the PDF cannot be opened or
         read at all; False otherwise.
     """
     try:
         with pdfplumber.open(file_path) as pdf:
             pages_to_check = min(sample_pages, len(pdf.pages))
             total_chars = sum(
                 len((pdf.pages[i].extract_text() or "").strip())
                 for i in range(pages_to_check)
             )
     except Exception as e:
         # Best-effort fallback: an unreadable PDF is still routed to the OCR
         # path, but the reason is reported instead of being swallowed.
         print(f"   Warning: could not sample PDF text ({e}); treating as scanned.")
         return True
     # max(..., 1) keeps a zero-page PDF from dividing by zero.
     avg = total_chars / max(pages_to_check, 1)
     print(f"   Avg chars/page (first {pages_to_check}): {avg:.0f}")
     return avg < min_chars_per_page
+# ── PDF: vision OCR via GPT-4o ────────────────────────────────────────────────
 def _pdf_page_to_base64(file_path: str, page_num: int) -> str:
     """Render one PDF page to PNG and return it base64-encoded.

     Args:
         file_path: Path to the PDF file.
         page_num: Zero-based page index; it is shifted by one before being
             passed to pdf2image as both first and last page.

     Returns:
         The PNG bytes as a base64 UTF-8 string, or "" when pdf2image
         returns no image for the page.
     """
     from pdf2image import convert_from_path

     one_based = page_num + 1
     rendered = convert_from_path(
         file_path,
         first_page=one_based,
         last_page=one_based,
         dpi=180,
     )
     if rendered:
         png_buffer = io.BytesIO()
         rendered[0].save(png_buffer, format="PNG")
         encoded = base64.b64encode(png_buffer.getvalue())
         return encoded.decode("utf-8")
     return ""
+# General-purpose OCR prompt; _extract_text_from_scanned_pdf sends it with every page image.
+_VISION_PROMPT_BODY = (
+    "This is a page from an Indian HFC/NBFC loan document (CAL/CAM/COE/Annexure). "
+    "Extract ALL text exactly as it appears. "
+    "For tables, output each row on one line with columns separated by ' | '. "
+    "Preserve all numbers, dates, rupee amounts, percentages, PAN numbers, addresses. "
+    "Do NOT summarize. Output raw extracted text only."
+)
+# Table-focused prompt variant. NOTE(review): not referenced in the code visible here — confirm whether it is still needed.
+_VISION_PROMPT_TABLE = (
+    "This page contains a table from an Indian loan document. "
+    "Extract ALL rows of the table with columns separated by ' | '. "
+    "Keep every row including headers and totals. "
+    "Also include any heading text above or below the table. "
+    "Do NOT summarize or skip any row."
+)
 def _extract_text_from_scanned_pdf(file_path: str) -> str:
     """OCR every page of a scanned PDF with GPT-4o vision.

     Each page is rendered to a PNG, sent to the chat completions API together
     with ``_VISION_PROMPT_BODY``, and the reply is appended under a
     ``=== PDF PAGE n ===`` marker. A page whose request raises is kept in the
     output as a marker followed by ``[extraction failed: ...]`` so one bad
     page does not abort the whole document.

     Args:
         file_path: Path to the PDF file.

     Returns:
         The per-page texts joined with newlines and stripped.

     Raises:
         ValueError: If the OPENAI_API_KEY environment variable is not set.
     """
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("OPENAI_API_KEY not set — required for scanned PDF OCR.")
     client = OpenAI(api_key=api_key)
     # pdfplumber is opened only to count pages; rendering happens per page below.
     with pdfplumber.open(file_path) as pdf:
         num_pages = len(pdf.pages)
     print(f"   Scanned PDF — {num_pages} pages, using GPT-4o vision...")
     all_text = []
     for page_num in range(num_pages):
         print(f"   Page {page_num + 1}/{num_pages}...")
         try:
             b64 = _pdf_page_to_base64(file_path, page_num)
             if not b64:
                 # Nothing to send; report the gap instead of dropping the page silently.
                 print(f"   Warning: page {page_num + 1} produced no image; skipped.")
                 continue
             # Which pages hold tables is not known in advance, so every page
             # gets the general prompt, which already asks for ' | ' table rows.
             response = client.chat.completions.create(
                 model="gpt-4o",
                 max_tokens=3000,
                 messages=[{
                     "role": "user",
                     "content": [
                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}", "detail": "high"}},
                         {"type": "text", "text": _VISION_PROMPT_BODY},
                     ]
                 }]
             )
             choice = response.choices[0]
             page_text = choice.message.content or ""
             if choice.finish_reason == "length":
                 # The reply hit max_tokens, so the tail of a dense page may be missing.
                 print(f"   Warning: page {page_num + 1} output was truncated at the token limit.")
             all_text.append(f"\n=== PDF PAGE {page_num + 1} ===\n{page_text}")
         except Exception as e:
             # Deliberate best-effort: record the failure and continue with the next page.
             print(f"   Warning: page {page_num + 1} failed: {e}")
             all_text.append(f"\n=== PDF PAGE {page_num + 1} === [extraction failed: {e}]")
     return "\n".join(all_text).strip()
+# ── PDF: text-based extraction ────────────────────────────────────────────────
 def extract_text_from_pdf(file_path: str) -> str:
     if _is_scanned_pdf(file_path):
         return _extract_text_from_scanned_pdf(file_path)
+    print("   Text-based PDF — using pdfplumber...")
     text_parts = []
     with pdfplumber.open(file_path) as pdf:
         for i, page in enumerate(pdf.pages):
+            page_text = page.extract_text() or ""
             if page_text:
+                text_parts.append(f"\n=== PDF PAGE {i + 1} ===\n{page_text}")
             tables = page.extract_tables()
             for table in tables:
                 for row in table:
     return "\n".join(text_parts).strip()
+# ── DOCX helpers ──────────────────────────────────────────────────────────────
 def _extract_cell_text(tc_element, depth: int = 0) -> str:
     parts = []
     for child in tc_element:
         tag = child.tag.split("}")[1] if "}" in child.tag else child.tag
                 for tc in tr.findall(qn("w:tc")):
                     cell_text = _extract_cell_text(tc, depth + 1)
                     row_cells.append(cell_text)
                 deduped = []
                 for val in row_cells:
                     if not deduped or val != deduped[-1]:
                         deduped.append(val)
                 row_str = " | ".join(deduped)
                 if row_str.strip(" |"):
                     parts.append(row_str)
     return "\n".join(parts)
+# Known heading patterns that mark important document sections
+_SECTION_HEADINGS = [
+    ("term sheet",            "=== TERM SHEET ==="),
+    ("terms of facility",     "=== TERM SHEET ==="),
+    ("annexure ii a",         "=== ANNEXURE II A — SECURITY UNITS P1 ==="),
+    ("annexure ii b",         "=== ANNEXURE II B — SECURITY UNITS P2 ==="),
+    ("annexure ii",           "=== ANNEXURE II — SECURITY UNITS ==="),
+    ("list of unsold units",  "=== SECURITY UNITS TABLE ==="),
+    ("list of unsold apartment", "=== SECURITY UNITS TABLE ==="),
+    ("repayment schedule",    "=== REPAYMENT SCHEDULE ==="),
+    ("details of co-borrower","=== CO-BORROWERS ==="),
+    ("details of co borrower","=== CO-BORROWERS ==="),
+    ("pre-disbursement condition", "=== PRE-DISBURSEMENT CONDITIONS ==="),
+    ("pre disbursement condition", "=== PRE-DISBURSEMENT CONDITIONS ==="),
+    ("other monitoring condition", "=== MONITORING CONDITIONS ==="),
+    ("special conditions",    "=== SPECIAL CONDITIONS ==="),
+    ("exit table",            "=== EXIT TABLE ==="),
+    ("collection slot",       "=== SI / EXIT TABLE ==="),
+    ("cash flow analysis",    "=== CASH FLOW ANALYSIS ==="),
+]
+def _inject_section_markers(text: str) -> str:
+    """Insert section markers before lines that match known headings."""
+    lines = text.split("\n")
+    out = []
+    for line in lines:
+        ll = line.lower().strip()
+        for pattern, marker in _SECTION_HEADINGS:
+            if pattern in ll and len(ll) < 120:
+                out.append(f"\n{marker}")
+                break
+        out.append(line)
+    return "\n".join(out)
 def extract_text_from_docx(file_path: str) -> str:
     doc = Document(file_path)
     chunks = []
             chunks.append(para.text.strip())
     for t_idx, table in enumerate(doc.tables):
         for row in table.rows:
             row_cells = []
             for cell in row.cells:
                 cell_text = _extract_cell_text(cell._tc)
                 row_cells.append(cell_text)
             deduped = []
             for val in row_cells:
                 if not deduped or val != deduped[-1]:
                     deduped.append(val)
             row_str = " | ".join(deduped)
             if row_str.strip(" |"):
                 chunks.append(row_str)
+    raw = "\n".join(chunks).strip()
+    return _inject_section_markers(raw)
+# ── Public API ────────────────────────────────────────────────────────────────
 def extract_text(file_path: str) -> str:
     ext = Path(file_path).suffix.lower()
         print("   Format: DOCX")
         return extract_text_from_docx(file_path)
+    elif ext == ".txt":
+        print("   Format: TXT")
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+            return f.read().strip()
     elif ext == ".doc":
+        raise ValueError(".doc is not supported. Save as .docx and re-upload.")
+    else:
+        raise ValueError(f"Unsupported format: {ext}. Supported: .pdf, .docx, .txt")
 if __name__ == "__main__":
     import sys
         path = sys.argv[1]
         print(f"[TEST] Reading: {path}")
         text = extract_text(path)
+        print(f"[TEST] Extracted {len(text):,} chars")
+        print("\n--- First 2000 chars ---")
+        print(text[:2000])
+        print("\n--- Last 2000 chars ---")
+        print(text[-2000:])
     else:
+        print("Usage: python doc_reader.py yourfile.pdf/docx/txt")