Spaces:
Sleeping
Sleeping
| # src/table_extraction.py | |
| from typing import List, Dict, Any | |
| import re | |
# Common phrases that indicate NON-item text (should be filtered out).
# They are matched as plain substrings of the lower-cased row text.
EXCLUDE_PHRASES = [
    "thank you", "thank", "goods sold", "not returnable", "returnable",
    "shopping at", "visit again", "customer copy", "merchant copy",
    "powered by", "terms and conditions", "t&c apply", "cashier",
    "counter", "sdn bhd", "bhd", "pte ltd", "pvt ltd", "llc", "inc",
    "gst summary", "tax summary", "payment", "change", "cash",
    "credit card", "debit card", "subtotal", "sub total", "grand total",
    "total includes", "includes gst", "tax invoice", "invoice",
]

# Words whose box marks the header row of the item table.
_HEADER_KEYWORDS = frozenset({
    "description", "item", "particulars", "qty", "quantity",
    "price", "amount", "rate", "uom", "unit",
})

# Words whose box marks the start of the footer (totals / payment area).
_FOOTER_KEYWORDS = frozenset({
    "total", "subtotal", "tax", "grand total", "payment", "cash",
    "change", "gst summary", "tax summary",
})

# A number is either a comma-grouped value (1,234.56) or a plain run of
# digits of any length (1234.56), each with an optional decimal part.
# Limiting ungrouped numbers to three digits would split "1234.56" into
# "123" and "4.56".
_NUMBER_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d*)?|\d+(?:\.\d*)?")


def _vertical_bounds(pairs):
    """Return ``(header_y, footer_y)`` delimiting the item area.

    ``pairs`` is a non-empty list of ``(word, box)`` tuples where each box
    is indexed as ``[x, y, width, height]``.
    """
    # Header boundary: the lowest bottom edge of any header keyword.
    # NOTE(review): a header keyword that also occurs low on the page
    # (e.g. "amount" beside the total) pushes this boundary down with it;
    # confirm against real receipts whether that needs handling.
    header_y = 0
    for word, box in pairs:
        if word.lower() in _HEADER_KEYWORDS:
            header_y = max(header_y, box[1] + box[3])

    # Footer boundary: the highest footer keyword that lies below the header.
    footer_y = float("inf")
    for word, box in pairs:
        if word.lower() in _FOOTER_KEYWORDS and header_y < box[1] < footer_y:
            footer_y = box[1]

    # Fallbacks when an anchor is missing: treat the top / bottom quarter
    # of the vertical extent as header / footer.
    max_y = max(box[1] for _, box in pairs)
    if header_y == 0:
        header_y = max_y * 0.25
    if footer_y == float("inf"):
        footer_y = max_y * 0.75
    return header_y, footer_y


def _group_rows(table_words, row_tolerance):
    """Cluster word dicts into rows by Y, each row sorted left to right.

    ``table_words`` must be non-empty. A word joins the current row when
    its top edge is within ``row_tolerance`` of the row's first word.
    """
    ordered = sorted(table_words, key=lambda w: w["box"][1])
    rows = []
    current_row = [ordered[0]]
    anchor_y = ordered[0]["box"][1]
    for item in ordered[1:]:
        y = item["box"][1]
        if abs(y - anchor_y) < row_tolerance:
            current_row.append(item)
            continue
        rows.append(sorted(current_row, key=lambda w: w["box"][0]))
        current_row = [item]
        anchor_y = y
    rows.append(sorted(current_row, key=lambda w: w["box"][0]))
    return rows


def _parse_row(full_text):
    """Turn one row's text into an item dict, or ``None`` if it is not an item."""
    lowered = full_text.lower()
    # Skip rows that match exclude phrases.
    if any(phrase in lowered for phrase in EXCLUDE_PHRASES):
        return None
    # Skip very short rows (likely noise).
    if len(full_text.strip()) < 3:
        return None

    matches = list(_NUMBER_RE.finditer(full_text))
    if not matches:
        # No numbers found = not a valid line item.
        return None

    # The last number on the row is taken as the price.
    price_match = matches[-1]
    try:
        value = float(price_match.group().replace(",", ""))
    except ValueError:
        return None
    if value <= 0:
        return None

    # Cut out only the matched price span. A global str.replace would also
    # strip identical digits elsewhere in the row (e.g. the "4" in "A4").
    description = (
        full_text[:price_match.start()] + full_text[price_match.end():]
    ).strip()
    if len(description) < 2:
        return None

    return {
        "description": description,
        "quantity": 1,
        "unit_price": value,
        "total": value,
    }


def extract_table_items(
    words: List[str],
    boxes: List[List[int]],
    *,
    row_tolerance: float = 15,
) -> List[Dict[str, Any]]:
    """
    Geometric heuristic to extract table rows.

    Logic:
    1. Find 'Header' Y-position (words like 'Description', 'Item', 'Qty').
    2. Find 'Footer' Y-position (where 'Total' usually sits).
    3. Keep all words strictly BETWEEN header and footer.
    4. Group remaining words into rows based on similar Y-coordinates.
    5. Parse each row: the last number is the price, the rest is the
       description; rows matching EXCLUDE_PHRASES or lacking a positive
       number are dropped.

    Args:
        words: Text tokens.
        boxes: One box per token, indexed as ``[x, y, width, height]``.
            If the two lists differ in length, the extra entries of the
            longer one are ignored.
        row_tolerance: Maximum difference in top-edge Y for two words to
            be placed on the same row.

    Returns:
        A list of dicts with keys ``description``, ``quantity`` (always 1),
        ``unit_price`` and ``total`` (both set to the parsed price).
    """
    if not words or not boxes:
        return []

    # Pair tokens with boxes up front so a length mismatch cannot raise
    # IndexError later on.
    pairs = list(zip(words, boxes))
    header_y, footer_y = _vertical_bounds(pairs)

    # The "sandwich" meat: words fully between header and footer.
    table_words = [
        {"text": word, "box": box}
        for word, box in pairs
        if box[1] > header_y and (box[1] + box[3]) < footer_y
    ]
    if not table_words:
        return []

    structured_items = []
    for row in _group_rows(table_words, row_tolerance):
        item = _parse_row(" ".join(w["text"] for w in row))
        if item is not None:
            structured_items.append(item)
    return structured_items