Spaces:
Sleeping
Sleeping
Update doc_reader.py
Browse files- doc_reader.py +108 -85
doc_reader.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
| 1 |
"""
|
| 2 |
doc_reader.py
|
| 3 |
-------------
|
| 4 |
-
Extracts full text from .docx and .
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
to extract all text. Falls back to pdfplumber for text-based PDFs.
|
| 8 |
For DOCX: recursive XML walk to catch nested tables.
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import os
|
| 12 |
import base64
|
|
|
|
|
|
|
| 13 |
import pdfplumber
|
| 14 |
from docx import Document
|
| 15 |
from docx.oxml.ns import qn
|
|
@@ -17,120 +19,110 @@ from pathlib import Path
|
|
| 17 |
from openai import OpenAI
|
| 18 |
|
| 19 |
|
| 20 |
-
# ββ
|
| 21 |
|
| 22 |
def _is_scanned_pdf(file_path: str, sample_pages: int = 3) -> bool:
|
| 23 |
-
"""Return True if PDF has little/no extractable text (i.e. scanned)."""
|
| 24 |
try:
|
| 25 |
with pdfplumber.open(file_path) as pdf:
|
| 26 |
pages_to_check = min(sample_pages, len(pdf.pages))
|
| 27 |
-
total_chars =
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
avg = total_chars / max(pages_to_check, 1)
|
| 32 |
-
print(f" Avg chars/page (first {pages_to_check}
|
| 33 |
-
return avg < 100
|
| 34 |
except Exception:
|
| 35 |
return True
|
| 36 |
|
| 37 |
|
| 38 |
-
# ββ
|
| 39 |
|
| 40 |
def _pdf_page_to_base64(file_path: str, page_num: int, dpi: int = 200) -> str:
    """Convert a single PDF page to a base64-encoded PNG using pdf2image.

    Args:
        file_path: Path of the PDF to render.
        page_num: Zero-based page index; it is shifted by one before being
            handed to ``convert_from_path`` as both first and last page.
        dpi: Rendering resolution passed through to ``convert_from_path``.

    Returns:
        The PNG bytes of the rendered page as a base64 string, or ``""`` when
        the renderer returns no image for that page.
    """
    # Local imports, as in the original: only this helper needs them.
    import io

    from pdf2image import convert_from_path

    page_index = page_num + 1
    images = convert_from_path(
        file_path,
        first_page=page_index,
        last_page=page_index,
        dpi=dpi,
    )
    if not images:
        return ""

    # The context manager releases the in-memory buffer as soon as the bytes
    # have been encoded.
    with io.BytesIO() as buf:
        images[0].save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 55 |
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def _extract_text_from_scanned_pdf(file_path: str) -> str:
|
| 58 |
-
"""Use GPT-4o vision to extract text from each page of a scanned PDF."""
|
| 59 |
api_key = os.getenv("OPENAI_API_KEY")
|
| 60 |
if not api_key:
|
| 61 |
-
raise ValueError("OPENAI_API_KEY not set β required for scanned PDF
|
| 62 |
|
| 63 |
client = OpenAI(api_key=api_key)
|
| 64 |
|
| 65 |
-
# Get page count
|
| 66 |
with pdfplumber.open(file_path) as pdf:
|
| 67 |
num_pages = len(pdf.pages)
|
| 68 |
|
| 69 |
-
print(f" Scanned PDF
|
| 70 |
-
|
| 71 |
all_text = []
|
| 72 |
|
| 73 |
for page_num in range(num_pages):
|
| 74 |
-
print(f"
|
| 75 |
try:
|
| 76 |
b64 = _pdf_page_to_base64(file_path, page_num)
|
| 77 |
if not b64:
|
| 78 |
continue
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 80 |
response = client.chat.completions.create(
|
| 81 |
model="gpt-4o",
|
| 82 |
-
max_tokens=
|
| 83 |
-
messages=[
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
"
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
"url": f"data:image/png;base64,{b64}",
|
| 91 |
-
"detail": "high"
|
| 92 |
-
}
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"type": "text",
|
| 96 |
-
"text": (
|
| 97 |
-
"This is a page from an Indian HFC/NBFC loan document (CAL/CAM/COE). "
|
| 98 |
-
"Extract ALL text from this page exactly as it appears. "
|
| 99 |
-
"Preserve table structure using | separators for columns. "
|
| 100 |
-
"Preserve all numbers, dates, percentages, names, addresses. "
|
| 101 |
-
"Do NOT summarize. Output raw extracted text only."
|
| 102 |
-
)
|
| 103 |
-
}
|
| 104 |
-
]
|
| 105 |
-
}
|
| 106 |
-
]
|
| 107 |
)
|
| 108 |
-
page_text = response.choices[0].message.content
|
| 109 |
-
all_text.append(f"\n
|
| 110 |
|
| 111 |
except Exception as e:
|
| 112 |
print(f" Warning: page {page_num + 1} failed: {e}")
|
| 113 |
-
all_text.append(f"\n
|
| 114 |
|
| 115 |
return "\n".join(all_text).strip()
|
| 116 |
|
| 117 |
|
| 118 |
-
# ββ
|
| 119 |
|
| 120 |
def extract_text_from_pdf(file_path: str) -> str:
|
| 121 |
-
"""Extract text from PDF β vision for scanned, pdfplumber for text-based."""
|
| 122 |
-
|
| 123 |
if _is_scanned_pdf(file_path):
|
| 124 |
return _extract_text_from_scanned_pdf(file_path)
|
| 125 |
|
| 126 |
-
|
| 127 |
-
print(" Text-based PDF detected β using pdfplumber...")
|
| 128 |
text_parts = []
|
| 129 |
with pdfplumber.open(file_path) as pdf:
|
| 130 |
for i, page in enumerate(pdf.pages):
|
| 131 |
-
page_text = page.extract_text()
|
| 132 |
if page_text:
|
| 133 |
-
text_parts.append(f"\n
|
| 134 |
tables = page.extract_tables()
|
| 135 |
for table in tables:
|
| 136 |
for row in table:
|
|
@@ -141,13 +133,9 @@ def extract_text_from_pdf(file_path: str) -> str:
|
|
| 141 |
return "\n".join(text_parts).strip()
|
| 142 |
|
| 143 |
|
| 144 |
-
# ββ
|
| 145 |
|
| 146 |
def _extract_cell_text(tc_element, depth: int = 0) -> str:
|
| 147 |
-
"""
|
| 148 |
-
Recursively walk a <w:tc> XML element and return all text, including
|
| 149 |
-
text inside nested <w:tbl> elements (tables-within-cells).
|
| 150 |
-
"""
|
| 151 |
parts = []
|
| 152 |
for child in tc_element:
|
| 153 |
tag = child.tag.split("}")[1] if "}" in child.tag else child.tag
|
|
@@ -163,12 +151,10 @@ def _extract_cell_text(tc_element, depth: int = 0) -> str:
|
|
| 163 |
for tc in tr.findall(qn("w:tc")):
|
| 164 |
cell_text = _extract_cell_text(tc, depth + 1)
|
| 165 |
row_cells.append(cell_text)
|
| 166 |
-
|
| 167 |
deduped = []
|
| 168 |
for val in row_cells:
|
| 169 |
if not deduped or val != deduped[-1]:
|
| 170 |
deduped.append(val)
|
| 171 |
-
|
| 172 |
row_str = " | ".join(deduped)
|
| 173 |
if row_str.strip(" |"):
|
| 174 |
parts.append(row_str)
|
|
@@ -176,6 +162,42 @@ def _extract_cell_text(tc_element, depth: int = 0) -> str:
|
|
| 176 |
return "\n".join(parts)
|
| 177 |
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
def extract_text_from_docx(file_path: str) -> str:
|
| 180 |
doc = Document(file_path)
|
| 181 |
chunks = []
|
|
@@ -185,26 +207,24 @@ def extract_text_from_docx(file_path: str) -> str:
|
|
| 185 |
chunks.append(para.text.strip())
|
| 186 |
|
| 187 |
for t_idx, table in enumerate(doc.tables):
|
| 188 |
-
chunks.append(f"\n--- Table {t_idx + 1} ---")
|
| 189 |
for row in table.rows:
|
| 190 |
row_cells = []
|
| 191 |
for cell in row.cells:
|
| 192 |
cell_text = _extract_cell_text(cell._tc)
|
| 193 |
row_cells.append(cell_text)
|
| 194 |
-
|
| 195 |
deduped = []
|
| 196 |
for val in row_cells:
|
| 197 |
if not deduped or val != deduped[-1]:
|
| 198 |
deduped.append(val)
|
| 199 |
-
|
| 200 |
row_str = " | ".join(deduped)
|
| 201 |
if row_str.strip(" |"):
|
| 202 |
chunks.append(row_str)
|
| 203 |
|
| 204 |
-
|
|
|
|
| 205 |
|
| 206 |
|
| 207 |
-
# ββ
|
| 208 |
|
| 209 |
def extract_text(file_path: str) -> str:
|
| 210 |
ext = Path(file_path).suffix.lower()
|
|
@@ -217,16 +237,17 @@ def extract_text(file_path: str) -> str:
|
|
| 217 |
print(" Format: DOCX")
|
| 218 |
return extract_text_from_docx(file_path)
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
elif ext == ".doc":
|
| 221 |
-
raise ValueError(
|
| 222 |
-
".doc (old Word format) is not supported. "
|
| 223 |
-
"Please save as .docx and re-upload."
|
| 224 |
-
)
|
| 225 |
-
else:
|
| 226 |
-
raise ValueError(f"Unsupported file format: {ext}. Supported: .pdf, .docx")
|
| 227 |
|
|
|
|
|
|
|
| 228 |
|
| 229 |
-
# βββ Quick test βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 230 |
|
| 231 |
if __name__ == "__main__":
|
| 232 |
import sys
|
|
@@ -234,8 +255,10 @@ if __name__ == "__main__":
|
|
| 234 |
path = sys.argv[1]
|
| 235 |
print(f"[TEST] Reading: {path}")
|
| 236 |
text = extract_text(path)
|
| 237 |
-
print(f"[TEST] Extracted {len(text):,}
|
| 238 |
-
print("\n--- First
|
| 239 |
-
print(text[:
|
|
|
|
|
|
|
| 240 |
else:
|
| 241 |
-
print("Usage: python doc_reader.py yourfile.pdf/docx")
|
|
|
|
| 1 |
"""
|
| 2 |
doc_reader.py
|
| 3 |
-------------
|
| 4 |
+
Extracts full text from .docx, .pdf, and .txt files.
|
| 5 |
+
For scanned PDFs: converts each page to image and uses GPT-4o vision.
|
| 6 |
+
Falls back to pdfplumber for text-based PDFs.
|
|
|
|
| 7 |
For DOCX: recursive XML walk to catch nested tables.
|
| 8 |
+
Outputs clear section markers so doc_sectioner can locate annexures.
|
| 9 |
"""
|
| 10 |
|
| 11 |
import os
|
| 12 |
import base64
|
| 13 |
+
import io
|
| 14 |
+
import re
|
| 15 |
import pdfplumber
|
| 16 |
from docx import Document
|
| 17 |
from docx.oxml.ns import qn
|
|
|
|
| 19 |
from openai import OpenAI
|
| 20 |
|
| 21 |
|
| 22 |
+
# ── PDF: detect if scanned ────────────────────────────────────────────────────
|
| 23 |
|
| 24 |
def _is_scanned_pdf(
    file_path: str,
    sample_pages: int = 3,
    min_chars_per_page: int = 100,
) -> bool:
    """Return True when the PDF has little or no extractable text (i.e. scanned).

    The leading pages are run through pdfplumber's text extraction and the
    average stripped character count per sampled page is compared with
    ``min_chars_per_page``.

    Args:
        file_path: Path of the PDF to inspect.
        sample_pages: Number of leading pages to sample. Values below 1 are
            treated as 1 so that at least one page is actually inspected.
        min_chars_per_page: Average characters per sampled page below which
            the document is classified as scanned.

    Returns:
        True if the sampled average is below the threshold, if the PDF has no
        pages, or if the file could not be inspected at all; False otherwise.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            pages_to_check = min(max(sample_pages, 1), len(pdf.pages))
            total_chars = sum(
                len((pdf.pages[i].extract_text() or "").strip())
                for i in range(pages_to_check)
            )
    except Exception as exc:
        # Best effort: an unreadable file is still reported as "scanned", but
        # the reason is surfaced so a missing or corrupt file is not mistaken
        # for a genuine image-only PDF further down the pipeline.
        print(f" Warning: could not inspect PDF text layer ({exc}); treating as scanned")
        return True

    # max(..., 1) guards the division for a PDF with zero pages.
    avg = total_chars / max(pages_to_check, 1)
    print(f" Avg chars/page (first {pages_to_check}): {avg:.0f}")
    return avg < min_chars_per_page
|
| 37 |
|
| 38 |
|
| 39 |
+
# ── PDF: vision OCR via GPT-4o ────────────────────────────────────────────────
|
| 40 |
|
| 41 |
def _pdf_page_to_base64(file_path: str, page_num: int, dpi: int = 180) -> str:
    """Render one PDF page to PNG and return the image base64-encoded.

    Args:
        file_path: Path of the PDF to render.
        page_num: Zero-based page index; it is shifted by one before being
            handed to ``convert_from_path`` as both first and last page.
        dpi: Rendering resolution passed through to ``convert_from_path``.

    Returns:
        The PNG bytes of the rendered page as a base64 string, or ``""`` when
        the renderer returns no image for that page.
    """
    # Imported inside the function, as in the original, so the module can be
    # imported without pdf2image when only text-based documents are read.
    from pdf2image import convert_from_path

    page_index = page_num + 1
    images = convert_from_path(
        file_path,
        first_page=page_index,
        last_page=page_index,
        dpi=dpi,
    )
    if not images:
        return ""

    # The context manager releases the in-memory buffer as soon as the bytes
    # have been encoded.
    with io.BytesIO() as buf:
        images[0].save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 49 |
|
| 50 |
|
| 51 |
+
# General-purpose OCR instruction sent with every page image.
_VISION_PROMPT_BODY = " ".join(
    [
        "This is a page from an Indian HFC/NBFC loan document (CAL/CAM/COE/Annexure).",
        "Extract ALL text exactly as it appears.",
        "For tables, output each row on one line with columns separated by ' | '.",
        "Preserve all numbers, dates, rupee amounts, percentages, PAN numbers, addresses.",
        "Do NOT summarize. Output raw extracted text only.",
    ]
)

# Table-focused variant of the OCR instruction for table-heavy pages.
_VISION_PROMPT_TABLE = " ".join(
    [
        "This page contains a table from an Indian loan document.",
        "Extract ALL rows of the table with columns separated by ' | '.",
        "Keep every row including headers and totals.",
        "Also include any heading text above or below the table.",
        "Do NOT summarize or skip any row.",
    ]
)
|
| 68 |
+
|
| 69 |
def _extract_text_from_scanned_pdf(file_path: str, max_tokens: int = 3000) -> str:
    """Extract text from a scanned PDF by sending each page image to GPT-4o.

    Each page is rendered with ``_pdf_page_to_base64`` and submitted to the
    OpenAI chat-completions endpoint together with ``_VISION_PROMPT_BODY``.
    Every page's output is preceded by a ``=== PDF PAGE n ===`` line. A page
    that cannot be rendered, or whose request raises, contributes a
    placeholder line instead of aborting the whole document.

    Args:
        file_path: Path of the PDF to OCR.
        max_tokens: Completion-token limit per page request. A warning is
            printed when a reply is cut off at this limit.

    Returns:
        The concatenated per-page text, stripped of leading/trailing
        whitespace.

    Raises:
        ValueError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not set — required for scanned PDF OCR.")

    client = OpenAI(api_key=api_key)

    # pdfplumber is only used here to count pages; errors opening the file
    # are not caught and propagate to the caller.
    with pdfplumber.open(file_path) as pdf:
        num_pages = len(pdf.pages)

    print(f" Scanned PDF — {num_pages} pages, using GPT-4o vision...")
    all_text = []

    for page_num in range(num_pages):
        page_label = page_num + 1
        print(f" Page {page_label}/{num_pages}...")
        try:
            b64 = _pdf_page_to_base64(file_path, page_num)
            if not b64:
                # Leave a visible placeholder so a page that failed to render
                # is not silently missing from the output.
                print(f" Warning: page {page_label} could not be rendered")
                all_text.append(
                    f"\n=== PDF PAGE {page_label} === "
                    "[extraction failed: page could not be rendered]"
                )
                continue

            # There is no way to know in advance which pages hold dense
            # tables, so the general prompt (which already asks for
            # ' | '-separated table rows) is used for every page.
            response = client.chat.completions.create(
                model="gpt-4o",
                max_tokens=max_tokens,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{b64}",
                                "detail": "high",
                            },
                        },
                        {"type": "text", "text": _VISION_PROMPT_BODY},
                    ],
                }],
            )
            choice = response.choices[0]
            if choice.finish_reason == "length":
                # The reply hit the token limit, so the tail of the page is
                # missing; say so rather than passing it off as complete.
                print(
                    f" Warning: page {page_label} output was truncated at "
                    f"{max_tokens} tokens"
                )
            page_text = choice.message.content or ""
            all_text.append(f"\n=== PDF PAGE {page_label} ===\n{page_text}")

        except Exception as e:
            # Deliberate best effort: one bad page must not lose the rest.
            print(f" Warning: page {page_label} failed: {e}")
            all_text.append(f"\n=== PDF PAGE {page_label} === [extraction failed: {e}]")

    return "\n".join(all_text).strip()
|
| 111 |
|
| 112 |
|
| 113 |
+
# ── PDF: text-based extraction ────────────────────────────────────────────────
|
| 114 |
|
| 115 |
def extract_text_from_pdf(file_path: str) -> str:
|
|
|
|
|
|
|
| 116 |
if _is_scanned_pdf(file_path):
|
| 117 |
return _extract_text_from_scanned_pdf(file_path)
|
| 118 |
|
| 119 |
+
print(" Text-based PDF β using pdfplumber...")
|
|
|
|
| 120 |
text_parts = []
|
| 121 |
with pdfplumber.open(file_path) as pdf:
|
| 122 |
for i, page in enumerate(pdf.pages):
|
| 123 |
+
page_text = page.extract_text() or ""
|
| 124 |
if page_text:
|
| 125 |
+
text_parts.append(f"\n=== PDF PAGE {i + 1} ===\n{page_text}")
|
| 126 |
tables = page.extract_tables()
|
| 127 |
for table in tables:
|
| 128 |
for row in table:
|
|
|
|
| 133 |
return "\n".join(text_parts).strip()
|
| 134 |
|
| 135 |
|
| 136 |
+
# ── DOCX helpers ──────────────────────────────────────────────────────────────
|
| 137 |
|
| 138 |
def _extract_cell_text(tc_element, depth: int = 0) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
parts = []
|
| 140 |
for child in tc_element:
|
| 141 |
tag = child.tag.split("}")[1] if "}" in child.tag else child.tag
|
|
|
|
| 151 |
for tc in tr.findall(qn("w:tc")):
|
| 152 |
cell_text = _extract_cell_text(tc, depth + 1)
|
| 153 |
row_cells.append(cell_text)
|
|
|
|
| 154 |
deduped = []
|
| 155 |
for val in row_cells:
|
| 156 |
if not deduped or val != deduped[-1]:
|
| 157 |
deduped.append(val)
|
|
|
|
| 158 |
row_str = " | ".join(deduped)
|
| 159 |
if row_str.strip(" |"):
|
| 160 |
parts.append(row_str)
|
|
|
|
| 162 |
return "\n".join(parts)
|
| 163 |
|
| 164 |
|
| 165 |
+
# Known heading phrases that mark important document sections.
# Each entry is (lower-case phrase, marker line to emit). Order matters: the
# first matching entry wins, so the more specific "annexure ii a" and
# "annexure ii b" phrases must stay ahead of the plain "annexure ii" entry.
_SECTION_HEADINGS = [
    ("term sheet", "=== TERM SHEET ==="),
    ("terms of facility", "=== TERM SHEET ==="),
    ("annexure ii a", "=== ANNEXURE II A — SECURITY UNITS P1 ==="),
    ("annexure ii b", "=== ANNEXURE II B — SECURITY UNITS P2 ==="),
    ("annexure ii", "=== ANNEXURE II — SECURITY UNITS ==="),
    ("list of unsold units", "=== SECURITY UNITS TABLE ==="),
    ("list of unsold apartment", "=== SECURITY UNITS TABLE ==="),
    ("repayment schedule", "=== REPAYMENT SCHEDULE ==="),
    ("details of co-borrower", "=== CO-BORROWERS ==="),
    ("details of co borrower", "=== CO-BORROWERS ==="),
    ("pre-disbursement condition", "=== PRE-DISBURSEMENT CONDITIONS ==="),
    ("pre disbursement condition", "=== PRE-DISBURSEMENT CONDITIONS ==="),
    ("other monitoring condition", "=== MONITORING CONDITIONS ==="),
    ("special conditions", "=== SPECIAL CONDITIONS ==="),
    ("exit table", "=== EXIT TABLE ==="),
    ("collection slot", "=== SI / EXIT TABLE ==="),
    ("cash flow analysis", "=== CASH FLOW ANALYSIS ==="),
]

# Lines this long or longer are treated as body text rather than headings.
_MAX_HEADING_LEN = 120


def _compile_heading(phrase: str) -> "re.Pattern":
    """Build a boundary-aware regex for one lower-case heading phrase."""
    tokens = phrase.split()
    body = r"\s+".join(re.escape(tok) for tok in tokens)
    # Allow a plural "s" (e.g. "condition" -> "conditions"), except after a
    # one-letter suffix, where "annexure ii a" must not match "annexure ii as".
    plural = "s?" if len(tokens[-1]) > 1 else ""
    # The look-arounds stop "annexure ii" from matching inside "annexure iii"
    # and "annexure ii a" from matching "annexure ii attached".
    return re.compile(rf"(?<![a-z0-9]){body}{plural}(?![a-z0-9])")


# Compiled once at import time; same order as _SECTION_HEADINGS.
_SECTION_HEADING_RES = [
    (_compile_heading(phrase), marker) for phrase, marker in _SECTION_HEADINGS
]


def _inject_section_markers(text: str) -> str:
    """Insert a section marker line before each line that matches a known heading.

    A line is considered a heading when, lower-cased and stripped, it is
    shorter than ``_MAX_HEADING_LEN`` characters and contains one of the
    ``_SECTION_HEADINGS`` phrases as whole words. The marker is emitted on its
    own line, preceded by a blank line, directly above the matching line.

    Running the function on text that already carries markers does not add
    them a second time.

    Args:
        text: Extracted document text with ``\\n`` line separators.

    Returns:
        The text with marker lines inserted.
    """
    out = []
    for line in text.split("\n"):
        ll = line.lower().strip()
        # Lines that are themselves markers (including the page markers) are
        # copied through untouched so they never trigger another marker.
        if not ll.startswith("===") and len(ll) < _MAX_HEADING_LEN:
            for pattern, marker in _SECTION_HEADING_RES:
                if pattern.search(ll):
                    # Skip when the marker already sits directly above this
                    # line, which keeps a second pass from duplicating it.
                    if not out or out[-1].strip() != marker:
                        out.append(f"\n{marker}")
                    break
        out.append(line)
    return "\n".join(out)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
def extract_text_from_docx(file_path: str) -> str:
|
| 202 |
doc = Document(file_path)
|
| 203 |
chunks = []
|
|
|
|
| 207 |
chunks.append(para.text.strip())
|
| 208 |
|
| 209 |
for t_idx, table in enumerate(doc.tables):
|
|
|
|
| 210 |
for row in table.rows:
|
| 211 |
row_cells = []
|
| 212 |
for cell in row.cells:
|
| 213 |
cell_text = _extract_cell_text(cell._tc)
|
| 214 |
row_cells.append(cell_text)
|
|
|
|
| 215 |
deduped = []
|
| 216 |
for val in row_cells:
|
| 217 |
if not deduped or val != deduped[-1]:
|
| 218 |
deduped.append(val)
|
|
|
|
| 219 |
row_str = " | ".join(deduped)
|
| 220 |
if row_str.strip(" |"):
|
| 221 |
chunks.append(row_str)
|
| 222 |
|
| 223 |
+
raw = "\n".join(chunks).strip()
|
| 224 |
+
return _inject_section_markers(raw)
|
| 225 |
|
| 226 |
|
| 227 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 228 |
|
| 229 |
def extract_text(file_path: str) -> str:
|
| 230 |
ext = Path(file_path).suffix.lower()
|
|
|
|
| 237 |
print(" Format: DOCX")
|
| 238 |
return extract_text_from_docx(file_path)
|
| 239 |
|
| 240 |
+
elif ext == ".txt":
|
| 241 |
+
print(" Format: TXT")
|
| 242 |
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 243 |
+
return f.read().strip()
|
| 244 |
+
|
| 245 |
elif ext == ".doc":
|
| 246 |
+
raise ValueError(".doc is not supported. Save as .docx and re-upload.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
+
else:
|
| 249 |
+
raise ValueError(f"Unsupported format: {ext}. Supported: .pdf, .docx, .txt")
|
| 250 |
|
|
|
|
| 251 |
|
| 252 |
if __name__ == "__main__":
|
| 253 |
import sys
|
|
|
|
| 255 |
path = sys.argv[1]
|
| 256 |
print(f"[TEST] Reading: {path}")
|
| 257 |
text = extract_text(path)
|
| 258 |
+
print(f"[TEST] Extracted {len(text):,} chars")
|
| 259 |
+
print("\n--- First 2000 chars ---")
|
| 260 |
+
print(text[:2000])
|
| 261 |
+
print("\n--- Last 2000 chars ---")
|
| 262 |
+
print(text[-2000:])
|
| 263 |
else:
|
| 264 |
+
print("Usage: python doc_reader.py yourfile.pdf/docx/txt")
|