Spaces:

GSoumyajit2005
/

invoice-processor-ml

Running

File size: 4,965 Bytes

90dbe20

# src/table_extraction.py

import re
from typing import Any, Dict, List, Optional, Tuple

# Common phrases that indicate NON-item text (should be filtered out).
# NOTE: these are matched as plain substrings of the lower-cased row text, so
# short entries such as "inc" or "cash" also hit words like "zinc" or "cashew".
EXCLUDE_PHRASES = [
    "thank you", "thank", "goods sold", "not returnable", "returnable",
    "shopping at", "visit again", "customer copy", "merchant copy",
    "powered by", "terms and conditions", "t&c apply", "cashier",
    "counter", "sdn bhd", "bhd", "pte ltd", "pvt ltd", "llc", "inc",
    "gst summary", "tax summary", "payment", "change", "cash",
    "credit card", "debit card", "subtotal", "sub total", "grand total",
    "total includes", "includes gst", "tax invoice", "invoice"
]

# Column titles that identify the table's header row.
_HEADER_KEYWORDS = frozenset({
    "description", "item", "particulars", "qty", "quantity", "price",
    "amount", "rate", "uom", "unit",
})

# Words that start the region below the table. Keywords are compared against
# single tokens, so the multi-word entries only match when one token contains
# the whole phrase.
_FOOTER_KEYWORDS = frozenset({
    "total", "subtotal", "tax", "grand total", "payment", "cash", "change",
    "gst summary", "tax summary",
})

# Characters stripped from both ends of a token before keyword lookup, so
# that "Total:" or "*Qty*" are still recognised.
_KEYWORD_STRIP_CHARS = " \t:;.,-*#()[]"

# Either a comma-grouped number (1,234.56) or a plain one (1234.56, 12, 0.90).
# The plain alternative keeps a long integer part and its decimals together.
_NUMBER_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?")


def _keyword_of(word: str) -> str:
    """Return *word* lower-cased with surrounding punctuation removed."""
    return word.lower().strip(_KEYWORD_STRIP_CHARS)


def _find_header_bottom(
    tokens: List[Tuple[str, List[int]]], row_tolerance: float
) -> Optional[float]:
    """Return the bottom edge of the header row, or None if there is none.

    Header-keyword tokens are clustered into rows by their top Y coordinate.
    The row with the most keywords wins (the topmost one on a tie), so a lone
    keyword further down the page, e.g. "Amount" in a "Total Amount" line,
    does not drag the header boundary below the table.
    """
    hits = sorted(
        (box[1], box[1] + box[3])
        for text, box in tokens
        if _keyword_of(text) in _HEADER_KEYWORDS
    )
    if not hits:
        return None

    keyword_rows: List[List[Tuple[float, float]]] = []
    for top, bottom in hits:
        # A row is anchored on the top edge of its first (highest) hit.
        if keyword_rows and abs(top - keyword_rows[-1][0][0]) < row_tolerance:
            keyword_rows[-1].append((top, bottom))
        else:
            keyword_rows.append([(top, bottom)])

    # max() returns the first maximal element, i.e. the topmost row on a tie.
    header_row = max(keyword_rows, key=len)
    return max(bottom for _, bottom in header_row)


def _group_rows(
    tokens: List[Tuple[str, List[int]]], row_tolerance: float
) -> List[List[Tuple[str, List[int]]]]:
    """Cluster tokens into rows by top Y; each row is ordered left to right."""
    rows: List[List[Tuple[str, List[int]]]] = []
    anchor_y = None
    for token in sorted(tokens, key=lambda t: t[1][1]):
        y = token[1][1]
        # Compare against the row's first token, not the previous token.
        if anchor_y is not None and abs(y - anchor_y) < row_tolerance:
            rows[-1].append(token)
        else:
            rows.append([token])
            anchor_y = y
    for row in rows:
        row.sort(key=lambda t: t[1][0])
    return rows


def _parse_row(text: str) -> Optional[Dict[str, Any]]:
    """Turn one row of text into an item dict, or None if it is not an item."""
    lowered = text.lower()
    if any(phrase in lowered for phrase in EXCLUDE_PHRASES):
        return None

    # Very short rows are likely noise.
    if len(text.strip()) < 3:
        return None

    matches = list(_NUMBER_RE.finditer(text))
    if not matches:
        # No numbers found = not a valid line item.
        return None

    # The last number on the row is taken as the price. The pattern only
    # yields digit strings, so float() cannot fail here.
    price = matches[-1]
    value = float(price.group().replace(",", ""))
    if value <= 0:
        return None

    # Cut out only the matched price span; a global str.replace would also
    # strip the same digits from the description (e.g. "A4 Paper 4").
    remainder = text[:price.start()] + text[price.end():]
    description = " ".join(remainder.split())
    if len(description) < 2:
        return None

    return {
        "description": description,
        "quantity": 1,
        "unit_price": value,
        "total": value,
    }


def extract_table_items(
    words: List[str],
    boxes: List[List[int]],
    *,
    row_tolerance: float = 15,
) -> List[Dict[str, Any]]:
    """
    Geometric heuristic to extract table rows from OCR words.

    Logic:
    1. Find the header row (the row with the most words like 'Description',
       'Item', 'Qty') and take its bottom edge as the top boundary.
    2. Find the footer (the highest word like 'Total' below the header) and
       take its top edge as the bottom boundary.
    3. Keep only words strictly BETWEEN header and footer.
    4. Group the remaining words into rows based on similar Y-coordinates.
    5. Turn each row into an item, using the last number as the price.

    Args:
        words: Token texts.
        boxes: One ``[x, y, width, height]`` box per word, in the same order.
            If the two lists differ in length, the extra entries are ignored.
        row_tolerance: Maximum difference in top Y (exclusive) for two words
            to be placed on the same row.

    Returns:
        A list of dicts with keys ``description``, ``quantity`` (always 1),
        ``unit_price`` and ``total`` (both set to the row's last number).
        Rows that contain an exclude phrase, have no positive number, or have
        no description left after removing the price are dropped.
    """
    if not words or not boxes:
        return []

    # zip() pairs words with boxes and tolerates a length mismatch.
    tokens = list(zip(words, boxes))

    # 1. Identify anchor points.
    header_bottom = _find_header_bottom(tokens, row_tolerance)
    header_y = header_bottom if header_bottom is not None else 0

    footer_y = min(
        (
            box[1]
            for text, box in tokens
            if _keyword_of(text) in _FOOTER_KEYWORDS and box[1] > header_y
        ),
        default=None,
    )

    lowest_top = max(box[1] for _, box in tokens)

    # If no header found, assume the top 25% is header.
    if header_bottom is None:
        header_y = lowest_top * 0.25

    # If no footer found, assume the bottom 25% is footer -- unless that
    # would fall at or above the header, which could never match anything;
    # in that case leave the table open-ended.
    if footer_y is None:
        fallback = lowest_top * 0.75
        footer_y = fallback if fallback > header_y else float("inf")

    # 2. Filter content (the "sandwich" meat).
    table_tokens = [
        (text, box)
        for text, box in tokens
        if box[1] > header_y and (box[1] + box[3]) < footer_y
    ]
    if not table_tokens:
        return []

    # 3. Group by rows, then 4. convert rows to structured dicts.
    structured_items = []
    for row in _group_rows(table_tokens, row_tolerance):
        item = _parse_row(" ".join(text for text, _ in row))
        if item is not None:
            structured_items.append(item)

    return structured_items