Spaces:

GSoumyajit2005
/

invoice-processor-ml

Sleeping

App Files Files Community

invoice-processor-ml / src /table_extraction.py

GSoumyajit2005

feat: added bulk processing, html reporting, and geometric table extraction

90dbe20 about 1 month ago

raw

history blame contribute delete

4.97 kB

	# src/table_extraction.py

	from typing import List, Dict, Any
	import re

	# Lowercase phrases that mark a row as NON-item text (receipt boilerplate,
	# payment lines, totals). A row whose lowercased text contains any of these
	# as a substring is filtered out of the extracted items.
	EXCLUDE_PHRASES = [
	    # Courtesy lines and return policy
	    "thank you",
	    "thank",
	    "goods sold",
	    "not returnable",
	    "returnable",
	    "shopping at",
	    "visit again",
	    # Receipt copies and legal boilerplate
	    "customer copy",
	    "merchant copy",
	    "powered by",
	    "terms and conditions",
	    "t&c apply",
	    # Staff / till identifiers
	    "cashier",
	    "counter",
	    # Company-name suffixes
	    "sdn bhd",
	    "bhd",
	    "pte ltd",
	    "pvt ltd",
	    "llc",
	    "inc",
	    # Tax summaries and payment details
	    "gst summary",
	    "tax summary",
	    "payment",
	    "change",
	    "cash",
	    "credit card",
	    "debit card",
	    # Totals
	    "subtotal",
	    "sub total",
	    "grand total",
	    "total includes",
	    "includes gst",
	    # Document titles
	    "tax invoice",
	    "invoice",
	]

	def extract_table_items(
	    words: List[str],
	    boxes: List[List[int]],
	    row_tolerance: float = 15,
	) -> List[Dict[str, Any]]:
	    """Extract line items from OCR words using a geometric heuristic.

	    Steps:
	      1. Find the header bottom edge (lowest word such as 'Description',
	         'Item', 'Qty').
	      2. Find the footer top edge (highest word such as 'Total' that lies
	         below the header).
	      3. Keep only the words lying strictly between header and footer.
	      4. Group those words into rows by similar Y coordinate, then turn each
	         row into an item dict, treating the last number as the price.

	    Args:
	        words: OCR tokens.
	        boxes: One box per token, indexed as ``[x, y, width, height]``.
	            If the two lists differ in length, the extra entries of the
	            longer one are ignored.
	        row_tolerance: Maximum Y distance (same units as ``boxes``) between
	            a word and the first word of the current row for the word to
	            be placed in that row.

	    Returns:
	        A list of dicts with keys ``description``, ``quantity`` (always 1),
	        ``unit_price`` and ``total`` (both set to the last number found in
	        the row). Rows matching ``EXCLUDE_PHRASES``, rows without a positive
	        number, and rows with no remaining description are dropped.
	    """
	    if not words or not boxes:
	        return []

	    header_keywords = {
	        "description", "item", "particulars", "qty", "quantity",
	        "price", "amount", "rate", "uom", "unit",
	    }
	    footer_keywords = {
	        "total", "subtotal", "tax", "grand total", "payment", "cash",
	        "change", "gst summary", "tax summary",
	    }

	    # zip() stops at the shorter list, so a words/boxes length mismatch
	    # cannot raise IndexError further down.
	    tokens = list(zip(words, boxes))

	    # 1. Identify anchor points.
	    # Header: the lowest bottom edge (y + height) of any header keyword.
	    header_y = 0
	    for word, box in tokens:
	        if word.lower() in header_keywords:
	            header_y = max(header_y, box[1] + box[3])

	    # Footer: the highest top edge of a footer keyword below the header.
	    footer_y = float("inf")
	    for word, box in tokens:
	        if word.lower() in footer_keywords and header_y < box[1] < footer_y:
	            footer_y = box[1]

	    # Fallbacks when an anchor is missing: cut at 25% / 75% of the largest
	    # Y coordinate seen among the boxes.
	    max_y = max(box[1] for _, box in tokens)
	    if header_y == 0:
	        header_y = max_y * 0.25
	    if footer_y == float("inf"):
	        footer_y = max_y * 0.75

	    # 2. Keep only the words strictly between header and footer.
	    table_words = [
	        {"text": word, "box": box}
	        for word, box in tokens
	        if box[1] > header_y and box[1] + box[3] < footer_y
	    ]
	    if not table_words:
	        return []

	    # 3. Group into rows: walk the words top-to-bottom and start a new row
	    # whenever a word is too far from the first word of the current row.
	    table_words.sort(key=lambda entry: entry["box"][1])
	    rows = []
	    current_row = [table_words[0]]
	    current_y = table_words[0]["box"][1]
	    for entry in table_words[1:]:
	        y = entry["box"][1]
	        if abs(y - current_y) < row_tolerance:
	            current_row.append(entry)
	        else:
	            rows.append(current_row)
	            current_row = [entry]
	            current_y = y
	    rows.append(current_row)

	    # Within each row, order the words left-to-right.
	    for row in rows:
	        row.sort(key=lambda entry: entry["box"][0])

	    # 4. Convert rows to structured dicts, filtering out non-item rows.
	    # Matches numbers such as 0.90, 12.50, 1,234.56 and plain integers.
	    number_re = re.compile(r"\d+(?:,\d{3})*(?:\.\d+)?")
	    structured_items = []

	    for row in rows:
	        full_text = " ".join(entry["text"] for entry in row)
	        lowered = full_text.lower()

	        # Skip boilerplate rows (substring match against EXCLUDE_PHRASES).
	        if any(phrase in lowered for phrase in EXCLUDE_PHRASES):
	            continue

	        # Skip very short rows (likely noise).
	        if len(full_text.strip()) < 3:
	            continue

	        # No numbers found = not a valid line item.
	        matches = list(number_re.finditer(full_text))
	        if not matches:
	            continue

	        # The last number in the row is taken as the price.
	        price_match = matches[-1]
	        try:
	            val = float(price_match.group().replace(",", ""))
	        except ValueError:
	            continue

	        # A zero price is not a usable line item.
	        if val <= 0:
	            continue

	        # Cut out only the matched span, so identical digits elsewhere in
	        # the row (e.g. inside a product code) are preserved, then collapse
	        # the whitespace left behind.
	        remainder = full_text[:price_match.start()] + full_text[price_match.end():]
	        description = " ".join(remainder.split())

	        # Skip if nothing meaningful is left to describe the item.
	        if len(description) < 2:
	            continue

	        structured_items.append({
	            "description": description,
	            "quantity": 1,
	            "unit_price": val,
	            "total": val,
	        })

	    return structured_items