triospacehub committed on
Commit
c8cbfa0
·
verified ·
1 Parent(s): 4bcf2c9

Update doc_reader.py

Browse files
Files changed (1) hide show
  1. doc_reader.py +108 -85
doc_reader.py CHANGED
@@ -1,15 +1,17 @@
1
  """
2
  doc_reader.py
3
  -------------
4
- Extracts full text from .docx and .pdf files.
5
-
6
- For scanned PDFs: converts each page to an image and uses GPT-4o vision
7
- to extract all text. Falls back to pdfplumber for text-based PDFs.
8
  For DOCX: recursive XML walk to catch nested tables.
 
9
  """
10
 
11
  import os
12
  import base64
 
 
13
  import pdfplumber
14
  from docx import Document
15
  from docx.oxml.ns import qn
@@ -17,120 +19,110 @@ from pathlib import Path
17
  from openai import OpenAI
18
 
19
 
20
- # ─── PDF: detect if scanned ───────────────────────────────────────────────────
21
 
22
  def _is_scanned_pdf(file_path: str, sample_pages: int = 3) -> bool:
23
- """Return True if PDF has little/no extractable text (i.e. scanned)."""
24
  try:
25
  with pdfplumber.open(file_path) as pdf:
26
  pages_to_check = min(sample_pages, len(pdf.pages))
27
- total_chars = 0
28
- for i in range(pages_to_check):
29
- text = pdf.pages[i].extract_text() or ""
30
- total_chars += len(text.strip())
31
  avg = total_chars / max(pages_to_check, 1)
32
- print(f" Avg chars/page (first {pages_to_check} pages): {avg:.0f}")
33
- return avg < 100 # scanned if less than 100 chars per page
34
  except Exception:
35
  return True
36
 
37
 
38
- # ─── PDF: vision extraction via GPT-4o ───────────────────────────────────────
39
 
40
  def _pdf_page_to_base64(file_path: str, page_num: int) -> str:
41
- """Convert a single PDF page to base64 PNG using pdf2image."""
42
  from pdf2image import convert_from_path
43
- images = convert_from_path(
44
- file_path,
45
- first_page=page_num + 1,
46
- last_page=page_num + 1,
47
- dpi=200
48
- )
49
  if not images:
50
  return ""
51
- import io
52
  buf = io.BytesIO()
53
  images[0].save(buf, format="PNG")
54
  return base64.b64encode(buf.getvalue()).decode("utf-8")
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def _extract_text_from_scanned_pdf(file_path: str) -> str:
58
- """Use GPT-4o vision to extract text from each page of a scanned PDF."""
59
  api_key = os.getenv("OPENAI_API_KEY")
60
  if not api_key:
61
- raise ValueError("OPENAI_API_KEY not set β€” required for scanned PDF extraction.")
62
 
63
  client = OpenAI(api_key=api_key)
64
 
65
- # Get page count
66
  with pdfplumber.open(file_path) as pdf:
67
  num_pages = len(pdf.pages)
68
 
69
- print(f" Scanned PDF detected β€” {num_pages} pages, using GPT-4o vision...")
70
-
71
  all_text = []
72
 
73
  for page_num in range(num_pages):
74
- print(f" Processing page {page_num + 1}/{num_pages}...")
75
  try:
76
  b64 = _pdf_page_to_base64(file_path, page_num)
77
  if not b64:
78
  continue
79
 
 
 
 
80
  response = client.chat.completions.create(
81
  model="gpt-4o",
82
- max_tokens=4096,
83
- messages=[
84
- {
85
- "role": "user",
86
- "content": [
87
- {
88
- "type": "image_url",
89
- "image_url": {
90
- "url": f"data:image/png;base64,{b64}",
91
- "detail": "high"
92
- }
93
- },
94
- {
95
- "type": "text",
96
- "text": (
97
- "This is a page from an Indian HFC/NBFC loan document (CAL/CAM/COE). "
98
- "Extract ALL text from this page exactly as it appears. "
99
- "Preserve table structure using | separators for columns. "
100
- "Preserve all numbers, dates, percentages, names, addresses. "
101
- "Do NOT summarize. Output raw extracted text only."
102
- )
103
- }
104
- ]
105
- }
106
- ]
107
  )
108
- page_text = response.choices[0].message.content
109
- all_text.append(f"\n--- Page {page_num + 1} ---\n{page_text}")
110
 
111
  except Exception as e:
112
  print(f" Warning: page {page_num + 1} failed: {e}")
113
- all_text.append(f"\n--- Page {page_num + 1} --- [extraction failed: {e}]")
114
 
115
  return "\n".join(all_text).strip()
116
 
117
 
118
- # ─── PDF: text-based extraction ───────────────────────────────────────────────
119
 
120
  def extract_text_from_pdf(file_path: str) -> str:
121
- """Extract text from PDF β€” vision for scanned, pdfplumber for text-based."""
122
-
123
  if _is_scanned_pdf(file_path):
124
  return _extract_text_from_scanned_pdf(file_path)
125
 
126
- # Text-based PDF β€” use pdfplumber
127
- print(" Text-based PDF detected β€” using pdfplumber...")
128
  text_parts = []
129
  with pdfplumber.open(file_path) as pdf:
130
  for i, page in enumerate(pdf.pages):
131
- page_text = page.extract_text()
132
  if page_text:
133
- text_parts.append(f"\n--- Page {i + 1} ---\n{page_text}")
134
  tables = page.extract_tables()
135
  for table in tables:
136
  for row in table:
@@ -141,13 +133,9 @@ def extract_text_from_pdf(file_path: str) -> str:
141
  return "\n".join(text_parts).strip()
142
 
143
 
144
- # ─── DOCX helpers ─────────────────────────────────────────────────────────────
145
 
146
  def _extract_cell_text(tc_element, depth: int = 0) -> str:
147
- """
148
- Recursively walk a <w:tc> XML element and return all text, including
149
- text inside nested <w:tbl> elements (tables-within-cells).
150
- """
151
  parts = []
152
  for child in tc_element:
153
  tag = child.tag.split("}")[1] if "}" in child.tag else child.tag
@@ -163,12 +151,10 @@ def _extract_cell_text(tc_element, depth: int = 0) -> str:
163
  for tc in tr.findall(qn("w:tc")):
164
  cell_text = _extract_cell_text(tc, depth + 1)
165
  row_cells.append(cell_text)
166
-
167
  deduped = []
168
  for val in row_cells:
169
  if not deduped or val != deduped[-1]:
170
  deduped.append(val)
171
-
172
  row_str = " | ".join(deduped)
173
  if row_str.strip(" |"):
174
  parts.append(row_str)
@@ -176,6 +162,42 @@ def _extract_cell_text(tc_element, depth: int = 0) -> str:
176
  return "\n".join(parts)
177
 
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  def extract_text_from_docx(file_path: str) -> str:
180
  doc = Document(file_path)
181
  chunks = []
@@ -185,26 +207,24 @@ def extract_text_from_docx(file_path: str) -> str:
185
  chunks.append(para.text.strip())
186
 
187
  for t_idx, table in enumerate(doc.tables):
188
- chunks.append(f"\n--- Table {t_idx + 1} ---")
189
  for row in table.rows:
190
  row_cells = []
191
  for cell in row.cells:
192
  cell_text = _extract_cell_text(cell._tc)
193
  row_cells.append(cell_text)
194
-
195
  deduped = []
196
  for val in row_cells:
197
  if not deduped or val != deduped[-1]:
198
  deduped.append(val)
199
-
200
  row_str = " | ".join(deduped)
201
  if row_str.strip(" |"):
202
  chunks.append(row_str)
203
 
204
- return "\n".join(chunks).strip()
 
205
 
206
 
207
- # ─── Public API ───────────────────────────────────────────────────────────────
208
 
209
  def extract_text(file_path: str) -> str:
210
  ext = Path(file_path).suffix.lower()
@@ -217,16 +237,17 @@ def extract_text(file_path: str) -> str:
217
  print(" Format: DOCX")
218
  return extract_text_from_docx(file_path)
219
 
 
 
 
 
 
220
  elif ext == ".doc":
221
- raise ValueError(
222
- ".doc (old Word format) is not supported. "
223
- "Please save as .docx and re-upload."
224
- )
225
- else:
226
- raise ValueError(f"Unsupported file format: {ext}. Supported: .pdf, .docx")
227
 
 
 
228
 
229
- # ─── Quick test ───────────────────────────────────────────────────────────────
230
 
231
  if __name__ == "__main__":
232
  import sys
@@ -234,8 +255,10 @@ if __name__ == "__main__":
234
  path = sys.argv[1]
235
  print(f"[TEST] Reading: {path}")
236
  text = extract_text(path)
237
- print(f"[TEST] Extracted {len(text):,} characters")
238
- print("\n--- First 3000 chars ---")
239
- print(text[:3000])
 
 
240
  else:
241
- print("Usage: python doc_reader.py yourfile.pdf/docx")
 
1
  """
2
  doc_reader.py
3
  -------------
4
+ Extracts full text from .docx, .pdf, and .txt files.
5
+ For scanned PDFs: converts each page to image and uses GPT-4o vision.
6
+ Falls back to pdfplumber for text-based PDFs.
 
7
  For DOCX: recursive XML walk to catch nested tables.
8
+ Outputs clear section markers so doc_sectioner can locate annexures.
9
  """
10
 
11
  import os
12
  import base64
13
+ import io
14
+ import re
15
  import pdfplumber
16
  from docx import Document
17
  from docx.oxml.ns import qn
 
19
  from openai import OpenAI
20
 
21
 
22
+ # ── PDF: detect if scanned ────────────────────────────────────────────────────
23
 
def _is_scanned_pdf(
    file_path: str,
    sample_pages: int = 3,
    min_chars_per_page: int = 100,
) -> bool:
    """Return True when the PDF looks scanned (little or no extractable text).

    The first ``sample_pages`` pages are run through pdfplumber's text
    extraction, the stripped character counts are averaged, and the PDF is
    classified as scanned when that average is below ``min_chars_per_page``.

    Args:
        file_path: Path of the PDF to inspect.
        sample_pages: Maximum number of leading pages to sample.
        min_chars_per_page: Average characters-per-page threshold below which
            the PDF is treated as scanned.

    Returns:
        True if the PDF should take the vision/OCR path. True is also
        returned when no pages are sampled (the average is then 0) and when
        the file cannot be opened or parsed at all.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            pages_to_check = min(sample_pages, len(pdf.pages))
            total_chars = sum(
                len((pdf.pages[i].extract_text() or "").strip())
                for i in range(pages_to_check)
            )
    except Exception as exc:
        # Best-effort fallback kept from the original (unreadable => scanned),
        # but the reason is now reported instead of being silently discarded.
        print(f" Warning: could not inspect PDF text layer ({exc}); treating as scanned")
        return True

    # max(..., 1) avoids a division by zero when no page was sampled.
    avg = total_chars / max(pages_to_check, 1)
    print(f" Avg chars/page (first {pages_to_check}): {avg:.0f}")
    return avg < min_chars_per_page
 
38
 
39
+ # ── PDF: vision OCR via GPT-4o ────────────────────────────────────────────────
40
 
def _pdf_page_to_base64(file_path: str, page_num: int, dpi: int = 180) -> str:
    """Render one PDF page to PNG and return the image base64-encoded.

    Args:
        file_path: Path of the PDF.
        page_num: Zero-based page index; it is shifted by one because
            ``first_page``/``last_page`` are passed as ``page_num + 1``.
        dpi: Render resolution handed to ``convert_from_path``.

    Returns:
        Base64 text (UTF-8 decoded) of the PNG bytes, or "" when the
        conversion yields no image.
    """
    # Local import: pdf2image is only loaded when a page is actually rendered.
    from pdf2image import convert_from_path

    one_based = page_num + 1
    images = convert_from_path(
        file_path,
        first_page=one_based,
        last_page=one_based,
        dpi=dpi,
    )
    if not images:
        return ""

    buf = io.BytesIO()
    images[0].save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")
49
 
50
 
# Prompt sent with every page image by _extract_text_from_scanned_pdf.
_VISION_PROMPT_BODY = (
    "This is a page from an Indian HFC/NBFC loan document (CAL/CAM/COE/Annexure). "
    "Extract ALL text exactly as it appears. "
    "For tables, output each row on one line with columns separated by ' | '. "
    "Preserve all numbers, dates, rupee amounts, percentages, PAN numbers, addresses. "
    "Do NOT summarize. Output raw extracted text only."
)

# Table-focused prompt. It is not referenced by _extract_text_from_scanned_pdf:
# the body prompt above already asks for ' | '-separated table rows.
_VISION_PROMPT_TABLE = (
    "This page contains a table from an Indian loan document. "
    "Extract ALL rows of the table with columns separated by ' | '. "
    "Keep every row including headers and totals. "
    "Also include any heading text above or below the table. "
    "Do NOT summarize or skip any row."
)


def _extract_text_from_scanned_pdf(
    file_path: str,
    model: str = "gpt-4o",
    max_tokens: int = 3000,
) -> str:
    """Extract the text of a scanned PDF page by page with a vision model.

    Each page is rendered by ``_pdf_page_to_base64`` and sent together with
    ``_VISION_PROMPT_BODY`` to the chat-completions endpoint. Every page's
    output is prefixed with a ``=== PDF PAGE n ===`` marker line.

    Args:
        file_path: Path of the PDF.
        model: Chat-completions model name.
        max_tokens: Completion token cap applied to each page request.

    Returns:
        The concatenated per-page text, stripped. A page whose request fails
        contributes its marker followed by ``[extraction failed: ...]``; a
        page that renders to no image contributes nothing.

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is not set.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not set — required for scanned PDF OCR.")

    client = OpenAI(api_key=api_key)

    # pdfplumber is used here only to learn the page count.
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

    print(f" Scanned PDF — {num_pages} pages, using GPT-4o vision...")
    all_text = []

    for page_num in range(num_pages):
        page_label = page_num + 1
        print(f" Page {page_label}/{num_pages}...")
        try:
            b64 = _pdf_page_to_base64(file_path, page_num)
            if not b64:
                # Previously skipped without notice; the returned text is
                # unchanged, the skip is just made visible.
                print(f" Warning: page {page_label} produced no image; skipped")
                continue

            # Which pages hold dense tables is unknown up front, so the same
            # body prompt (which already requests row-per-line tables) is
            # used for every page.
            response = client.chat.completions.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{b64}",
                                "detail": "high",
                            },
                        },
                        {"type": "text", "text": _VISION_PROMPT_BODY},
                    ],
                }],
            )
            choice = response.choices[0]
            page_text = choice.message.content or ""
            if choice.finish_reason == "length":
                # The completion stopped at max_tokens, so trailing rows of a
                # dense page may be missing from page_text.
                print(
                    f" Warning: page {page_label} output reached max_tokens="
                    f"{max_tokens} and may be truncated"
                )
            all_text.append(f"\n=== PDF PAGE {page_label} ===\n{page_text}")

        except Exception as e:
            # Best effort per page: record the failure in the output and go on.
            print(f" Warning: page {page_label} failed: {e}")
            all_text.append(f"\n=== PDF PAGE {page_label} === [extraction failed: {e}]")

    return "\n".join(all_text).strip()
111
 
112
 
113
+ # ── PDF: text-based extraction ────────────────────────────────────────────────
114
 
115
  def extract_text_from_pdf(file_path: str) -> str:
 
 
116
  if _is_scanned_pdf(file_path):
117
  return _extract_text_from_scanned_pdf(file_path)
118
 
119
+ print(" Text-based PDF β€” using pdfplumber...")
 
120
  text_parts = []
121
  with pdfplumber.open(file_path) as pdf:
122
  for i, page in enumerate(pdf.pages):
123
+ page_text = page.extract_text() or ""
124
  if page_text:
125
+ text_parts.append(f"\n=== PDF PAGE {i + 1} ===\n{page_text}")
126
  tables = page.extract_tables()
127
  for table in tables:
128
  for row in table:
 
133
  return "\n".join(text_parts).strip()
134
 
135
 
136
+ # ── DOCX helpers ──────────────────────────────────────────────────────────────
137
 
138
  def _extract_cell_text(tc_element, depth: int = 0) -> str:
 
 
 
 
139
  parts = []
140
  for child in tc_element:
141
  tag = child.tag.split("}")[1] if "}" in child.tag else child.tag
 
151
  for tc in tr.findall(qn("w:tc")):
152
  cell_text = _extract_cell_text(tc, depth + 1)
153
  row_cells.append(cell_text)
 
154
  deduped = []
155
  for val in row_cells:
156
  if not deduped or val != deduped[-1]:
157
  deduped.append(val)
 
158
  row_str = " | ".join(deduped)
159
  if row_str.strip(" |"):
160
  parts.append(row_str)
 
162
  return "\n".join(parts)
163
 
164
 
# Known heading patterns (lower-case) that mark important document sections,
# paired with the marker line to emit. Order matters: the first matching
# entry wins, so the more specific annexure entries precede "annexure ii".
_SECTION_HEADINGS = [
    ("term sheet", "=== TERM SHEET ==="),
    ("terms of facility", "=== TERM SHEET ==="),
    ("annexure ii a", "=== ANNEXURE II A — SECURITY UNITS P1 ==="),
    ("annexure iia", "=== ANNEXURE II A — SECURITY UNITS P1 ==="),
    ("annexure ii b", "=== ANNEXURE II B — SECURITY UNITS P2 ==="),
    ("annexure iib", "=== ANNEXURE II B — SECURITY UNITS P2 ==="),
    ("annexure ii", "=== ANNEXURE II — SECURITY UNITS ==="),
    ("list of unsold units", "=== SECURITY UNITS TABLE ==="),
    ("list of unsold apartment", "=== SECURITY UNITS TABLE ==="),
    ("repayment schedule", "=== REPAYMENT SCHEDULE ==="),
    ("details of co-borrower", "=== CO-BORROWERS ==="),
    ("details of co borrower", "=== CO-BORROWERS ==="),
    ("pre-disbursement condition", "=== PRE-DISBURSEMENT CONDITIONS ==="),
    ("pre disbursement condition", "=== PRE-DISBURSEMENT CONDITIONS ==="),
    ("other monitoring condition", "=== MONITORING CONDITIONS ==="),
    ("special conditions", "=== SPECIAL CONDITIONS ==="),
    ("exit table", "=== EXIT TABLE ==="),
    ("collection slot", "=== SI / EXIT TABLE ==="),
    ("cash flow analysis", "=== CASH FLOW ANALYSIS ==="),
]

# Patterns with these prefixes end in an identifier (roman numeral / letter).
# They must not match when the identifier simply continues, otherwise
# "annexure iii" is tagged as Annexure II and "annexure ii and ..." as II A.
_IDENTIFIER_HEADING_PREFIXES = ("annexure ",)


def _compile_heading(pattern: str):
    """Compile one heading pattern into a regex used with ``search``.

    Ordinary patterns keep plain substring semantics (so plurals such as
    "conditions" or "co-borrowers" still match). Identifier-style patterns
    additionally require that no letter or digit follows the match.
    """
    regex = re.escape(pattern)
    if pattern.startswith(_IDENTIFIER_HEADING_PREFIXES):
        regex += r"(?![a-z0-9])"
    return re.compile(regex)


# Compiled once at import time; rebuild if _SECTION_HEADINGS is ever edited
# at runtime.
_SECTION_MATCHERS = [
    (_compile_heading(pattern), marker) for pattern, marker in _SECTION_HEADINGS
]


def _inject_section_markers(text: str, max_heading_len: int = 120) -> str:
    """Insert a section marker line before each line matching a known heading.

    Every input line is lower-cased and has its whitespace runs collapsed to
    single spaces before matching, so irregular or non-breaking spacing does
    not hide a heading. Only lines shorter than ``max_heading_len`` (after
    that normalisation) are considered, which keeps long body paragraphs
    that merely mention a heading from being tagged. At most one marker is
    emitted per line, taken from the first matching table entry.

    Args:
        text: Extracted document text, lines separated by newlines.
        max_heading_len: Exclusive upper bound on the normalised line length
            for a line to be treated as a heading candidate.

    Returns:
        The text with a blank line plus marker line inserted before each
        matching line; all original lines are kept unchanged.
    """
    out = []
    for line in text.split("\n"):
        normalized = " ".join(line.lower().split())
        if normalized and len(normalized) < max_heading_len:
            for matcher, marker in _SECTION_MATCHERS:
                if matcher.search(normalized):
                    out.append(f"\n{marker}")
                    break
        out.append(line)
    return "\n".join(out)
199
+
200
+
201
  def extract_text_from_docx(file_path: str) -> str:
202
  doc = Document(file_path)
203
  chunks = []
 
207
  chunks.append(para.text.strip())
208
 
209
  for t_idx, table in enumerate(doc.tables):
 
210
  for row in table.rows:
211
  row_cells = []
212
  for cell in row.cells:
213
  cell_text = _extract_cell_text(cell._tc)
214
  row_cells.append(cell_text)
 
215
  deduped = []
216
  for val in row_cells:
217
  if not deduped or val != deduped[-1]:
218
  deduped.append(val)
 
219
  row_str = " | ".join(deduped)
220
  if row_str.strip(" |"):
221
  chunks.append(row_str)
222
 
223
+ raw = "\n".join(chunks).strip()
224
+ return _inject_section_markers(raw)
225
 
226
 
227
+ # ── Public API ────────────────────────────────────────────────────────────────
228
 
229
  def extract_text(file_path: str) -> str:
230
  ext = Path(file_path).suffix.lower()
 
237
  print(" Format: DOCX")
238
  return extract_text_from_docx(file_path)
239
 
240
+ elif ext == ".txt":
241
+ print(" Format: TXT")
242
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
243
+ return f.read().strip()
244
+
245
  elif ext == ".doc":
246
+ raise ValueError(".doc is not supported. Save as .docx and re-upload.")
 
 
 
 
 
247
 
248
+ else:
249
+ raise ValueError(f"Unsupported format: {ext}. Supported: .pdf, .docx, .txt")
250
 
 
251
 
252
  if __name__ == "__main__":
253
  import sys
 
255
  path = sys.argv[1]
256
  print(f"[TEST] Reading: {path}")
257
  text = extract_text(path)
258
+ print(f"[TEST] Extracted {len(text):,} chars")
259
+ print("\n--- First 2000 chars ---")
260
+ print(text[:2000])
261
+ print("\n--- Last 2000 chars ---")
262
+ print(text[-2000:])
263
  else:
264
+ print("Usage: python doc_reader.py yourfile.pdf/docx/txt")