Spaces:

triospacehub
/

ocr

Sleeping

App Files Files Community

ocr / doc_sectioner.py

triospacehub

Update doc_sectioner.py

0d6f39d verified about 2 months ago

raw

history blame contribute delete

8.93 kB

	"""
	doc_sectioner.py
	----------------
	Builds targeted text slices for each extraction pass.
	Key fix: annexures (security units, co-borrowers) live in the TAIL of
	long CAM documents (chars 90k+), where keyword matching alone can miss
	them. Callers should therefore include get_document_tail() explicitly in
	every pass; the keyword-selected chunks built here do not append it.
	"""

	import re
	from typing import Dict, List
	from doc_reader import extract_text


	# ── Section markers to split on ──────────────────────────────────────────────

	SECTION_MARKERS = [
	"=== TERM SHEET ===",
	"=== ANNEXURE II A",
	"=== ANNEXURE II B",
	"=== ANNEXURE II ",
	"=== SECURITY UNITS TABLE ===",
	"=== REPAYMENT SCHEDULE ===",
	"=== CO-BORROWERS ===",
	"=== PRE-DISBURSEMENT CONDITIONS ===",
	"=== MONITORING CONDITIONS ===",
	"=== SPECIAL CONDITIONS ===",
	"=== SI / EXIT TABLE ===",
	"=== CASH FLOW ANALYSIS ===",
	"=== PDF PAGE",
	# fallback natural headings
	"pre-disbursement condition",
	"pre disbursement condition",
	"conditions for first disbursement",
	"conditions to be satisfied",
	"other monitoring condition",
	"monitoring condition",
	"special condition",
	"tranche",
	"disbursal tranche",
	"repayment",
	"exit table",
	"standing instruction",
	"collection slot",
	"term sheet",
	"executive summary",
	"project detail",
	"last date of drawal",
	"last date of draw",
	"details of co-borrower",
	"details of the co borrower",
	"annexure ii",
	"list of unsold",
	]


	def split_into_sections(full_text: str) -> Dict[str, str]:
	    """Split ``full_text`` into sections keyed by their heading line.

	    A line opens a new section when its stripped length is below 150
	    characters and it contains any entry of ``SECTION_MARKERS``. Matching is
	    case-insensitive and uses each marker with its leading ``=`` characters
	    and surrounding whitespace removed. Lines before the first heading are
	    stored under the key ``"header"``.

	    Args:
	        full_text: Newline-separated document text.

	    Returns:
	        Mapping from section name (the stripped heading line, truncated to
	        80 characters) to that section's text, heading line included. If
	        the same name occurs more than once, the bodies are joined in
	        document order under that single key instead of the later body
	        overwriting the earlier one.
	    """
	    # Normalise the markers once per call rather than once per input line.
	    needles = [marker.lower().lstrip("=").strip() for marker in SECTION_MARKERS]

	    # (section name, lines) pairs in document order.
	    pieces = []
	    current_section = "header"
	    current_lines: List[str] = []

	    for line in full_text.split("\n"):
	        line_lower = line.lower().strip()
	        is_heading = len(line.strip()) < 150 and any(
	            needle in line_lower for needle in needles
	        )
	        if is_heading:
	            if current_lines:
	                pieces.append((current_section, current_lines))
	            current_section = line.strip()[:80]
	            current_lines = [line]
	        else:
	            current_lines.append(line)

	    if current_lines:
	        pieces.append((current_section, current_lines))

	    sections: Dict[str, str] = {}
	    for name, body_lines in pieces:
	        body = "\n".join(body_lines)
	        # Repeated headings (or names that collide after truncation) are
	        # merged so no text is silently dropped.
	        sections[name] = f"{sections[name]}\n{body}" if name in sections else body

	    return sections


	# ── Keyword mapping per field group ──────────────────────────────────────────

	# Keywords per extraction group. get_chunk_for_fields() lower-cases each
	# keyword and tests it as a plain substring of each lower-cased line.
	FIELD_KEYWORDS: Dict[str, List[str]] = {
	    "basic": [
	        "cal no", "cal number", "hfccoe", "date", "borrower", "address",
	        "gst", "pan", "sanction date", "loan amount", "lender",
	        "credit arrangement letter", "executive summary", "applicant",
	        "financial assistance", "kind attn"
	    ],
	    "repayment": [
	        "repayment", "moratorium", "interest", "rate of interest", "ihplr",
	        "spread", "applicable rate", "loan amount", "tenure", "emi",
	        "last date of drawal", "last date of draw", "processing fee",
	        "penal", "isr", "dsra", "isra", "12.50", "11.50", "18.85"
	    ],
	    "rera": [
	        "rera", "registration no", "project end", "commencement",
	        "odcco", "construction progress", "stage of construction", "soc",
	        "project detail", "type of project", "valuation", "saleable area",
	        "developer", "location", "land area", "expected completion"
	    ],
	    "tranche": [
	        "tranche", "disbursal", "disbursement tranche", "% of construction",
	        "sales (plots)", "collection", "end use", "purpose of facility"
	    ],
	    "exit_si": [
	        "collection slot", "adjustment (si)", "standing instruction",
	        "exit table", "si to be kept", "si %", "40%", "80%", "75%",
	        "up to", "above"
	    ],
	    "pre_conditions": [
	        "pre-disbursement condition", "pre disbursement condition",
	        "conditions for first disbursement", "conditions: for first",
	        "1st tranche", "2nd tranche", "for first disbursal"
	    ],
	    "post_conditions": [
	        "within 15 days", "within 15 working days", "within 30 working days",
	        "within 45 working days", "conditions to be satisfied within",
	        "conditions to be complied", "within stipulated"
	    ],
	    "monitoring": [
	        "other monitoring condition", "monitoring condition",
	        "monitoring of construction", "special condition", "regulatory",
	        "mandatory condition"
	    ],
	    "rating": [
	        "rating", "sfrg", "structured finance risk", "bbb", "category a",
	        "project rating", "internal rating"
	    ],
	    # These two groups must use the document TAIL — they live at the end of CAM
	    "security_units": [
	        "=== annexure ii", "=== security units", "list of unsold",
	        "unsold units", "mortgaged", "hypothecated", "unit no", "floor no",
	        "saleable area", "rera carpet area", "207 a", "207 b", "401",
	        # The backslash is part of the keyword: matching is a plain substring
	        # test, not a regex. It is written as "\\|" because a bare "\|" is an
	        # invalid escape sequence; the runtime value is unchanged.
	        # NOTE(review): presumably targets escaped table pipes in the
	        # extracted text — confirm against doc_reader output.
	        "flat \\| unsold", "office \\| unsold", "annexure ii a", "annexure ii b"
	    ],
	    "borrowers": [
	        "=== co-borrowers", "details of co-borrower", "details of the co borrower",
	        "promoter/ partner", "co-borrower details", "pan", "dob", "cibil",
	        # NOTE(review): the entries below look specific to one set of
	        # borrowers — confirm they are meant to be hard-coded.
	        "net worth", "acxpr", "akypr", "biapr", "ayrpr", "cjbpr",
	        "rajkumar", "dinesh", "kunal", "karan", "pratik",
	        "yogeshbhai", "hareshkumar", "darshanbhai", "dhruvin"
	    ],
	}


	def get_chunk_for_fields(
	    full_text: str,
	    field_group: str,
	    window_lines: int = 100,
	) -> str:
	    """Return the lines of ``full_text`` that surround keyword hits for a group.

	    Every line containing (case-insensitively, as a substring) any keyword
	    listed under ``FIELD_KEYWORDS[field_group]`` selects a window running
	    from 5 lines before the hit up to, but not including, ``window_lines``
	    lines after it, clipped to the document bounds. Overlapping windows are
	    merged and the selected lines are returned in document order.

	    Args:
	        full_text: Newline-separated document text.
	        field_group: Key into ``FIELD_KEYWORDS``; an unknown key yields ``""``.
	        window_lines: Number of lines, counted from the hit line, to keep.

	    Returns:
	        The selected lines joined with newlines, or ``""`` when nothing hits.
	        Non-adjacent windows are joined without any separator.
	    """
	    # Lower-case the keywords once instead of once per (line, keyword) pair.
	    needles = [kw.lower() for kw in FIELD_KEYWORDS.get(field_group, [])]
	    if not needles:
	        return ""

	    lines = full_text.split("\n")
	    line_count = len(lines)
	    relevant_indices: set = set()

	    for i, line in enumerate(lines):
	        line_lower = line.lower()
	        # One hit is enough: further keywords would only re-add the same window.
	        if any(needle in line_lower for needle in needles):
	            start = max(0, i - 5)
	            end = min(line_count, i + window_lines)
	            relevant_indices.update(range(start, end))

	    if not relevant_indices:
	        return ""

	    return "\n".join(lines[i] for i in sorted(relevant_indices))


	def build_targeted_context(
	    full_text: str,
	    max_chars_per_chunk: int = 14000,
	) -> Dict[str, str]:
	    """Build one keyword-selected chunk per field group.

	    Each group name is passed to ``get_chunk_for_fields`` and the result is
	    cut to its first ``max_chars_per_chunk`` characters. Groups with no
	    hits map to ``""``.

	    Args:
	        full_text: Document text to slice.
	        max_chars_per_chunk: Upper bound on the length of each chunk.

	    Returns:
	        Mapping from group name to its (possibly empty) chunk, in the fixed
	        order listed below.
	    """
	    group_names = (
	        "basic", "repayment", "rera", "tranche", "exit_si",
	        "pre_conditions", "post_conditions", "monitoring", "rating",
	        "security_units", "borrowers",
	    )
	    return {
	        name: (get_chunk_for_fields(full_text, name) or "")[:max_chars_per_chunk]
	        for name in group_names
	    }


	def get_combined_context(full_text: str) -> str:
	    """
	    Returns a single context string built from the per-group chunks of
	    build_targeted_context(), each preceded by a "=== ... ===" header and
	    separated from the next by a blank line. Groups whose chunk is empty
	    after stripping are left out.

	    NOTE: the document tail is NOT appended here. Tail content appears only
	    where keyword hits happened to select it; callers that need the
	    annexures / co-borrower tables must add get_document_tail() themselves.
	    """
	    chunks = build_targeted_context(full_text)

	    section_order = [
	        ("=== BASIC INFO / EXECUTIVE SUMMARY ===", "basic"),
	        ("=== REPAYMENT / INTEREST / TENURE ===", "repayment"),
	        ("=== RERA / CONSTRUCTION PROGRESS ===", "rera"),
	        ("=== TRANCHE DETAILS ===", "tranche"),
	        ("=== EXIT TABLE / SI ===", "exit_si"),
	        ("=== PRE-DISBURSEMENT CONDITIONS ===", "pre_conditions"),
	        ("=== POST-DISBURSEMENT CONDITIONS ===", "post_conditions"),
	        ("=== MONITORING CONDITIONS ===", "monitoring"),
	        ("=== PROJECT RATING ===", "rating"),
	        ("=== SECURITY UNITS (ANNEXURE) ===", "security_units"),
	        ("=== CO-BORROWERS / PARTNERS ===", "borrowers"),
	    ]

	    # Collect the pieces and join once instead of growing a string with +=.
	    parts: List[str] = []
	    for header, key in section_order:
	        text = chunks.get(key, "").strip()
	        if text:
	            parts.append(f"{header}\n{text}")

	    return "\n\n".join(parts)


	def get_document_tail(full_text: str, tail_chars: int = 20000) -> str:
	    """
	    Returns the last `tail_chars` characters of the document, or the whole
	    document when it is not longer than `tail_chars`.
	    Critical for capturing annexures in long CAM documents where
	    security unit tables and co-borrower tables live at the very end.

	    A `tail_chars` of zero or less returns an empty string.
	    """
	    if tail_chars <= 0:
	        # full_text[-0:] is the entire string, and a negative count would
	        # slice from the front, so handle both explicitly.
	        return ""
	    # Slicing already returns the whole string when it is shorter than the
	    # requested tail.
	    return full_text[-tail_chars:]


	def get_document_head(full_text: str, head_chars: int = 5000) -> str:
	    """
	    Returns the first `head_chars` characters of the document, or the whole
	    document when it is shorter than that.

	    A `head_chars` of zero or less returns an empty string.
	    """
	    # Clamp so a negative count cannot turn into "all but the last n".
	    return full_text[:max(0, head_chars)]


	if __name__ == "__main__":
	    # Manual smoke test: read the file named on the command line and print
	    # the sizes of the full text, the combined context and the tail,
	    # followed by the first 3000 characters of the combined context.
	    import sys

	    cli_args = sys.argv[1:]
	    if not cli_args:
	        print("Usage: python doc_sectioner.py yourfile.docx")
	    else:
	        source_path = cli_args[0]
	        print(f"Reading: {source_path}")
	        document_text = extract_text(source_path)
	        print(f"Full text: {len(document_text)} chars")
	        smart_context = get_combined_context(document_text)
	        print(f"Smart context: {len(smart_context)} chars")
	        tail_text = get_document_tail(document_text)
	        print(f"Tail (last 20k): {len(tail_text)} chars")
	        print("\n--- Context preview ---")
	        print(smart_context[:3000])