Spaces:
Sleeping
Sleeping
| """ | |
| doc_sectioner.py | |
| ---------------- | |
| Builds targeted text slices for each extraction pass. | |
| Key fix: annexures (security units, co-borrowers) live in the TAIL of | |
| long CAM documents (chars 90k+). We always include the tail explicitly | |
| for every pass instead of relying solely on keyword scoring. | |
| """ | |
| import re | |
| from typing import Dict, List | |
| from doc_reader import extract_text | |
# ── Section markers to split on ──────────────────────────────────────────────
SECTION_MARKERS = [
    "=== TERM SHEET ===",
    "=== ANNEXURE II A",
    "=== ANNEXURE II B",
    "=== ANNEXURE II ",
    "=== SECURITY UNITS TABLE ===",
    "=== REPAYMENT SCHEDULE ===",
    "=== CO-BORROWERS ===",
    "=== PRE-DISBURSEMENT CONDITIONS ===",
    "=== MONITORING CONDITIONS ===",
    "=== SPECIAL CONDITIONS ===",
    "=== SI / EXIT TABLE ===",
    "=== CASH FLOW ANALYSIS ===",
    "=== PDF PAGE",
    # fallback natural headings
    "pre-disbursement condition",
    "pre disbursement condition",
    "conditions for first disbursement",
    "conditions to be satisfied",
    "other monitoring condition",
    "monitoring condition",
    "special condition",
    "tranche",
    "disbursal tranche",
    "repayment",
    "exit table",
    "standing instruction",
    "collection slot",
    "term sheet",
    "executive summary",
    "project detail",
    "last date of drawal",
    "last date of draw",
    "details of co-borrower",
    "details of the co borrower",
    "annexure ii",
    "list of unsold",
]


def split_into_sections(full_text: str) -> Dict[str, str]:
    """Split *full_text* into named sections at heading-like lines.

    A line starts a new section when its stripped length is under 150
    characters and it contains (case-insensitively) any entry of
    ``SECTION_MARKERS``, after the marker's leading ``=`` characters and
    surrounding whitespace are removed. The section name is the stripped
    heading line truncated to 80 characters, and the heading line itself is
    the first line of the section body. Text before the first heading is
    stored under ``"header"``.

    When the same section name occurs more than once, the bodies are joined
    with a newline in document order, so a repeated heading does not discard
    the text collected under its earlier occurrence.

    Args:
        full_text: The document text, with lines separated by ``"\\n"``.

    Returns:
        Mapping of section name to section text, in order of first
        appearance.
    """
    # Normalise the markers once instead of once per line of input.
    needles = [marker.lower().lstrip("=").strip() for marker in SECTION_MARKERS]

    sections: Dict[str, str] = {}

    def _store(name: str, buffered: List[str]) -> None:
        """Record *buffered* under *name*, appending on a repeated name."""
        if not buffered:
            return
        body = "\n".join(buffered)
        if name in sections:
            sections[name] = sections[name] + "\n" + body
        else:
            sections[name] = body

    current_section = "header"
    current_lines: List[str] = []

    for line in full_text.split("\n"):
        stripped = line.strip()
        line_lower = line.lower().strip()
        # Long lines are treated as body text even if they mention a marker.
        is_heading = len(stripped) < 150 and any(
            needle in line_lower for needle in needles
        )
        if is_heading:
            _store(current_section, current_lines)
            current_section = stripped[:80]
            current_lines = [line]
        else:
            current_lines.append(line)

    _store(current_section, current_lines)
    return sections
# ── Keyword mapping per field group ──────────────────────────────────────────
FIELD_KEYWORDS: Dict[str, List[str]] = {
    "basic": [
        "cal no", "cal number", "hfccoe", "date", "borrower", "address",
        "gst", "pan", "sanction date", "loan amount", "lender",
        "credit arrangement letter", "executive summary", "applicant",
        "financial assistance", "kind attn"
    ],
    "repayment": [
        "repayment", "moratorium", "interest", "rate of interest", "ihplr",
        "spread", "applicable rate", "loan amount", "tenure", "emi",
        "last date of drawal", "last date of draw", "processing fee",
        "penal", "isr", "dsra", "isra", "12.50", "11.50", "18.85"
    ],
    "rera": [
        "rera", "registration no", "project end", "commencement",
        "odcco", "construction progress", "stage of construction", "soc",
        "project detail", "type of project", "valuation", "saleable area",
        "developer", "location", "land area", "expected completion"
    ],
    "tranche": [
        "tranche", "disbursal", "disbursement tranche", "% of construction",
        "sales (plots)", "collection", "end use", "purpose of facility"
    ],
    "exit_si": [
        "collection slot", "adjustment (si)", "standing instruction",
        "exit table", "si to be kept", "si %", "40%", "80%", "75%",
        "up to", "above"
    ],
    "pre_conditions": [
        "pre-disbursement condition", "pre disbursement condition",
        "conditions for first disbursement", "conditions: for first",
        "1st tranche", "2nd tranche", "for first disbursal"
    ],
    "post_conditions": [
        "within 15 days", "within 15 working days", "within 30 working days",
        "within 45 working days", "conditions to be satisfied within",
        "conditions to be complied", "within stipulated"
    ],
    "monitoring": [
        "other monitoring condition", "monitoring condition",
        "monitoring of construction", "special condition", "regulatory",
        "mandatory condition"
    ],
    "rating": [
        "rating", "sfrg", "structured finance risk", "bbb", "category a",
        "project rating", "internal rating"
    ],
    # These two groups must use the document TAIL — they live at the end of CAM
    "security_units": [
        "=== annexure ii", "=== security units", "list of unsold",
        "unsold units", "mortgaged", "hypothecated", "unit no", "floor no",
        "saleable area", "rera carpet area", "207 a", "207 b", "401",
        "flat | unsold", "office | unsold", "annexure ii a", "annexure ii b"
    ],
    "borrowers": [
        "=== co-borrowers", "details of co-borrower", "details of the co borrower",
        "promoter/ partner", "co-borrower details", "pan", "dob", "cibil",
        "net worth", "acxpr", "akypr", "biapr", "ayrpr", "cjbpr",
        "rajkumar", "dinesh", "kunal", "karan", "pratik",
        "yogeshbhai", "hareshkumar", "darshanbhai", "dhruvin"
    ],
}


def get_chunk_for_fields(
    full_text: str,
    field_group: str,
    window_lines: int = 100,
) -> str:
    """Return the lines of *full_text* that surround keyword hits for a group.

    Every line containing (case-insensitively, as a substring) any keyword
    listed for *field_group* in ``FIELD_KEYWORDS`` selects a window running
    from 5 lines before the hit up to, but not including, the line
    ``window_lines`` after it. All selected lines are returned once each, in
    document order, joined with ``"\\n"``. Windows that are not adjacent in
    the document are concatenated with no separator between them.

    Args:
        full_text: The document text, with lines separated by ``"\\n"``.
        field_group: A key of ``FIELD_KEYWORDS``; an unknown key has no
            keywords and therefore yields ``""``.
        window_lines: Number of lines, counted from the matching line, to
            include after each hit.

    Returns:
        The selected lines joined by newlines, or ``""`` if nothing matched.
    """
    # Lowercase the keywords once rather than for every line of the document.
    keywords = [kw.lower() for kw in FIELD_KEYWORDS.get(field_group, [])]
    lines = full_text.split("\n")
    total = len(lines)

    relevant_indices: set = set()
    for i, line in enumerate(lines):
        line_lower = line.lower()
        # The window depends only on the line index, so one matching keyword
        # is enough; further matches would re-add exactly the same indices.
        if any(kw in line_lower for kw in keywords):
            relevant_indices.update(
                range(max(0, i - 5), min(total, i + window_lines))
            )

    if not relevant_indices:
        return ""
    return "\n".join(lines[i] for i in sorted(relevant_indices))
def build_targeted_context(
    full_text: str,
    max_chars_per_chunk: int = 14000,
) -> Dict[str, str]:
    """Build one keyword-selected text chunk per field group.

    Each group name below is passed to ``get_chunk_for_fields`` and the
    result is cut to its first ``max_chars_per_chunk`` characters. A group
    with no keyword hits maps to ``""``.

    Args:
        full_text: The document text.
        max_chars_per_chunk: Upper bound on the length of each chunk.

    Returns:
        Mapping of group name to its (possibly empty) chunk, in the order
        the groups are listed here.
    """
    group_names = (
        "basic", "repayment", "rera", "tranche", "exit_si",
        "pre_conditions", "post_conditions", "monitoring", "rating",
        "security_units", "borrowers",
    )
    # Slicing an empty string yields an empty string, so groups without any
    # hit need no special case.
    return {
        name: get_chunk_for_fields(full_text, name)[:max_chars_per_chunk]
        for name in group_names
    }
def get_combined_context(full_text: str) -> str:
    """Concatenate all non-empty field-group chunks under labelled headers.

    The chunks come from ``build_targeted_context``. Each non-empty chunk is
    stripped and emitted as ``<header>\\n<chunk>``; the blocks are separated
    by one blank line and appear in the fixed order below.

    NOTE(review): this function only joins the keyword-selected chunks; it
    does not itself append the raw document tail. Callers that need the tail
    must add it separately (``get_document_tail``).

    Args:
        full_text: The document text.

    Returns:
        The combined context string, or ``""`` if every chunk is empty.
    """
    chunks = build_targeted_context(full_text)
    labelled_groups = (
        ("=== BASIC INFO / EXECUTIVE SUMMARY ===", "basic"),
        ("=== REPAYMENT / INTEREST / TENURE ===", "repayment"),
        ("=== RERA / CONSTRUCTION PROGRESS ===", "rera"),
        ("=== TRANCHE DETAILS ===", "tranche"),
        ("=== EXIT TABLE / SI ===", "exit_si"),
        ("=== PRE-DISBURSEMENT CONDITIONS ===", "pre_conditions"),
        ("=== POST-DISBURSEMENT CONDITIONS ===", "post_conditions"),
        ("=== MONITORING CONDITIONS ===", "monitoring"),
        ("=== PROJECT RATING ===", "rating"),
        ("=== SECURITY UNITS (ANNEXURE) ===", "security_units"),
        ("=== CO-BORROWERS / PARTNERS ===", "borrowers"),
    )

    blocks: List[str] = []
    for header, key in labelled_groups:
        body = chunks.get(key, "").strip()
        if body:
            blocks.append(f"{header}\n{body}")

    # Headers never start with whitespace and bodies are already stripped,
    # so the joined result has no leading or trailing whitespace to trim.
    return "\n\n".join(blocks)
def get_document_tail(full_text: str, tail_chars: int = 20000) -> str:
    """Return the last ``tail_chars`` characters of the document.

    Critical for capturing annexures in long CAM documents where security
    unit tables and co-borrower tables live at the very end.

    Args:
        full_text: The document text.
        tail_chars: Maximum number of trailing characters to return.

    Returns:
        The whole text when it is no longer than ``tail_chars``, otherwise
        its final ``tail_chars`` characters. A non-positive ``tail_chars``
        yields ``""``.
    """
    # ``s[-0:]`` is ``s[0:]`` (the whole string) and a negative size would
    # flip the slice into "drop a prefix", so handle those sizes explicitly.
    if tail_chars <= 0:
        return ""
    # Slice bounds are clamped, so short documents come back unchanged.
    return full_text[-tail_chars:]
def get_document_head(full_text: str, head_chars: int = 5000) -> str:
    """Return the leading slice ``full_text[:head_chars]`` of the document.

    Args:
        full_text: The document text.
        head_chars: End index of the slice taken from the start of the text.

    Returns:
        The sliced prefix; the whole text when it is shorter than
        ``head_chars``.
    """
    head = full_text[:head_chars]
    return head
if __name__ == "__main__":
    # Manual smoke test: extract a document and report the size of each slice.
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python doc_sectioner.py yourfile.docx")
    else:
        path = cli_args[0]
        print(f"Reading: {path}")

        full_text = extract_text(path)
        print(f"Full text: {len(full_text)} chars")

        context = get_combined_context(full_text)
        print(f"Smart context: {len(context)} chars")
        print(f"Tail (last 20k): {len(get_document_tail(full_text))} chars")

        # Show only the start of the combined context to keep output short.
        print("\n--- Context preview ---")
        print(context[:3000])