# Source: invoice-processor-ml / src/table_extraction.py
# Author: GSoumyajit2005
# Commit 90dbe20 - feat: added bulk processing, HTML reporting, and geometric table extraction
# src/table_extraction.py
from typing import List, Dict, Any
import re
# Phrases that mark a row as NON-item text (greetings, legal boilerplate,
# payment and summary lines). Rows are tested with a case-insensitive
# *substring* match on the joined row text, so short entries such as "inc" or
# "cash" also hit longer words that merely contain them.
EXCLUDE_PHRASES: List[str] = [
    "thank you", "thank", "goods sold", "not returnable", "returnable",
    "shopping at", "visit again", "customer copy", "merchant copy",
    "powered by", "terms and conditions", "t&c apply", "cashier",
    "counter", "sdn bhd", "bhd", "pte ltd", "pvt ltd", "llc", "inc",
    "gst summary", "tax summary", "payment", "change", "cash",
    "credit card", "debit card", "subtotal", "sub total", "grand total",
    "total includes", "includes gst", "tax invoice", "invoice"
]

# A number token: either comma-grouped thousands ("1,234") or a plain digit run
# of any length ("1234"), optionally followed by a decimal part (".56").
# The plain-digit alternative matters: limiting the integer part to 1-3 digits
# would split "1234.56" into "123" and "4.56".
_NUMBER_RE = re.compile(r"(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?")


def extract_table_items(
    words: List[str],
    boxes: List[List[int]],
    *,
    row_tolerance: float = 15,
) -> List[Dict[str, Any]]:
    """Extract line items from OCR tokens using a geometric heuristic.

    Steps:
      1. Find the header boundary: the lowest bottom edge of any token that is
         a header keyword ('description', 'item', 'qty', ...).
      2. Find the footer boundary: the topmost footer keyword ('total',
         'tax', ...) that lies below the header.
      3. Keep only tokens lying strictly between the two boundaries.
      4. Cluster the remaining tokens into rows by similar top-Y coordinate.
      5. Turn each row into an item dict, using the last number in the row as
         its price and dropping rows that look like non-item text.

    Args:
        words: Token texts, parallel to ``boxes``.
        boxes: One box per token. The code reads index 0 as the left X,
            index 1 as the top Y and index 3 as the height (it adds
            ``box[1] + box[3]`` to obtain the bottom edge).
        row_tolerance: A token joins the current row when its top Y differs
            from the row's first token by less than this amount (same units
            as the box coordinates).

    Returns:
        A list of dicts with keys ``description`` (row text with the price
        removed), ``quantity`` (always 1), ``unit_price`` and ``total`` (both
        set to the parsed price). Empty when nothing qualifies.
    """
    if not words or not boxes:
        return []

    # zip() stops at the shorter input, so a words/boxes length mismatch drops
    # the unpaired tail instead of raising IndexError mid-scan.
    tokens = list(zip(words, boxes))

    header_keywords = frozenset((
        "description", "item", "particulars", "qty", "quantity",
        "price", "amount", "rate", "uom", "unit",
    ))
    footer_keywords = frozenset((
        "total", "subtotal", "tax", "grand total", "payment", "cash",
        "change", "gst summary", "tax summary",
    ))

    # 1. Header boundary: lowest bottom edge among header-keyword tokens.
    #    0 doubles as the "not found" marker.
    header_y = 0
    for word, box in tokens:
        if word.lower() in header_keywords:
            header_y = max(header_y, box[1] + box[3])

    # 2. Footer boundary: topmost footer-keyword token below the header.
    footer_y = float("inf")
    for word, box in tokens:
        if word.lower() in footer_keywords and header_y < box[1] < footer_y:
            footer_y = box[1]

    # Fallbacks when an anchor is missing: cut at 25% / 75% of the largest
    # top-Y coordinate seen.
    if header_y == 0 or footer_y == float("inf"):
        max_y = max(box[1] for box in boxes)
        if header_y == 0:
            header_y = max_y * 0.25
        if footer_y == float("inf"):
            footer_y = max_y * 0.75

    # 3. Keep tokens lying strictly between header and footer.
    table_words = [
        {"text": word, "box": box}
        for word, box in tokens
        if box[1] > header_y and (box[1] + box[3]) < footer_y
    ]
    if not table_words:
        return []

    # 4. Cluster into rows. Tokens are visited top to bottom; each is compared
    #    against the top Y of the token that opened the current row.
    table_words.sort(key=lambda w: w["box"][1])
    rows = []
    current_row = [table_words[0]]
    row_y = table_words[0]["box"][1]
    for item in table_words[1:]:
        y = item["box"][1]
        if abs(y - row_y) < row_tolerance:
            current_row.append(item)
        else:
            # Row finished: order its tokens left to right.
            rows.append(sorted(current_row, key=lambda w: w["box"][0]))
            current_row = [item]
            row_y = y
    rows.append(sorted(current_row, key=lambda w: w["box"][0]))

    # 5. Convert rows to structured items, filtering out non-item rows.
    structured_items = []
    for row in rows:
        full_text = " ".join(w["text"] for w in row)
        full_text_lower = full_text.lower()

        # Skip boilerplate / payment / summary rows.
        if any(phrase in full_text_lower for phrase in EXCLUDE_PHRASES):
            continue

        # Skip very short rows (likely OCR noise).
        if len(full_text.strip()) < 3:
            continue

        # A row without any number cannot be a priced line item.
        matches = list(_NUMBER_RE.finditer(full_text))
        if not matches:
            continue

        # The last number in the row is taken as the price.
        price_match = matches[-1]
        try:
            value = float(price_match.group().replace(",", ""))
        except ValueError:
            continue
        if value <= 0:
            continue

        # Cut out only the matched price span. A plain str.replace() would
        # also delete identical digits elsewhere in the description
        # (e.g. the "1" in "1L" when the price is "1").
        description = (
            full_text[:price_match.start()] + full_text[price_match.end():]
        ).strip()
        if len(description) < 2:
            continue

        structured_items.append({
            "description": description,
            "quantity": 1,
            "unit_price": value,
            "total": value,
        })

    return structured_items