File size: 4,965 Bytes
90dbe20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# src/table_extraction.py

from typing import List, Dict, Any
import re

# Common phrases that indicate NON-item text (should be filtered out).
# NOTE(review): rows are tested by case-insensitive *substring* match, so short
# entries such as "inc" or "cash" also hit words like "zinc" or "cashew".
# Entries like "thank" rely on substring matching (to catch "thanks"), so the
# matching mode is left as-is here.
EXCLUDE_PHRASES = [
    "thank you", "thank", "goods sold", "not returnable", "returnable",
    "shopping at", "visit again", "customer copy", "merchant copy",
    "powered by", "terms and conditions", "t&c apply", "cashier",
    "counter", "sdn bhd", "bhd", "pte ltd", "pvt ltd", "llc", "inc",
    "gst summary", "tax summary", "payment", "change", "cash",
    "credit card", "debit card", "subtotal", "sub total", "grand total",
    "total includes", "includes gst", "tax invoice", "invoice"
]

# Words whose bottom edge marks the top boundary of the item table.
_HEADER_KEYWORDS = frozenset([
    "description", "item", "particulars", "qty", "quantity",
    "price", "amount", "rate", "uom", "unit",
])

# Words whose top edge marks the bottom boundary of the item table.
_FOOTER_KEYWORDS = frozenset([
    "total", "subtotal", "tax", "grand total", "payment",
    "cash", "change", "gst summary", "tax summary",
])

# One numeric token: any run of digits, optional ",ddd" thousands groups and an
# optional decimal part (0.90, 12.50, 1234.56, 1,234.56).  The leading run is
# unbounded so that "1234.56" is not split into "123" and "4.56".
_NUMBER_RE = re.compile(r'\d+(?:,\d{3})*\.?\d*')


def _cluster_rows(
    table_words: List[Dict[str, Any]], tolerance: float
) -> List[List[Dict[str, Any]]]:
    """Group word dicts into rows by similar top-Y, each row sorted left-to-right.

    A word joins the current row when its top-Y is within ``tolerance`` of the
    top-Y of the word that *started* that row; otherwise it starts a new row.
    ``table_words`` must be non-empty.
    """
    ordered = sorted(table_words, key=lambda w: w["box"][1])

    rows = []
    current_row = [ordered[0]]
    anchor_y = ordered[0]["box"][1]

    for item in ordered[1:]:
        y = item["box"][1]
        if abs(y - anchor_y) < tolerance:
            current_row.append(item)
        else:
            rows.append(sorted(current_row, key=lambda w: w["box"][0]))
            current_row = [item]
            anchor_y = y

    rows.append(sorted(current_row, key=lambda w: w["box"][0]))
    return rows


def extract_table_items(
    words: List[str],
    boxes: List[List[int]],
    *,
    row_tolerance: float = 15,
) -> List[Dict[str, Any]]:
    """
    Geometric heuristic to extract line-item rows from OCR words.

    Logic:
    1. Find the 'Header' Y-position (lowest bottom edge of words like
       'Description', 'Item', 'Qty').  If none is found, fall back to 25% of
       the largest top-Y.
    2. Find the 'Footer' Y-position (highest footer keyword such as 'Total'
       that lies below the header).  If none is found, fall back to 75% of
       the largest top-Y.
    3. Keep only words strictly BETWEEN header and footer.
    4. Group the remaining words into rows by similar Y-coordinate.
    5. Drop rows that contain an EXCLUDE_PHRASES entry, are shorter than three
       characters, contain no number, or whose last number is <= 0.  The last
       number in a row is taken as its price and removed from the description.

    Args:
        words: OCR tokens.
        boxes: One box per token, read as ``[x, y, width, height]``.  If the
            two lists differ in length, only the leading pairs are used.
        row_tolerance: Maximum top-Y distance (exclusive) for a word to be
            placed in the same row as the word that started the row.

    Returns:
        A list of dicts with keys ``description`` (row text without the price),
        ``quantity`` (always 1), ``unit_price`` and ``total`` (both the parsed
        price).  Empty when nothing qualifies.
    """
    if not words or not boxes:
        return []

    # zip() stops at the shorter list, so a length mismatch cannot raise.
    tokens = list(zip(words, boxes))
    max_y = max(box[1] for _, box in tokens)

    # 1. Header (top boundary): lowest bottom edge among header keywords.
    header_y = 0
    for word, box in tokens:
        if word.lower() in _HEADER_KEYWORDS:
            header_y = max(header_y, box[1] + box[3])

    # Resolve the header fallback BEFORE scanning for the footer; otherwise a
    # footer keyword near the top of the page (e.g. "TAX" in "TAX INVOICE")
    # would be chosen as footer and end up above the fallback header.
    if header_y == 0:
        header_y = max_y * 0.25

    # 2. Footer (bottom boundary): highest footer keyword below the header.
    footer_y = float('inf')
    for word, box in tokens:
        if word.lower() in _FOOTER_KEYWORDS and header_y < box[1] < footer_y:
            footer_y = box[1]

    if footer_y == float('inf'):
        footer_y = max_y * 0.75

    # 3. Keep the words that sit entirely between header and footer.
    table_words = [
        {"text": word, "box": box}
        for word, box in tokens
        if box[1] > header_y and box[1] + box[3] < footer_y
    ]
    if not table_words:
        return []

    # 4 + 5. Cluster into rows and convert each row to a structured item.
    structured_items = []
    for row in _cluster_rows(table_words, row_tolerance):
        full_text = " ".join(w["text"] for w in row)
        full_text_lower = full_text.lower()

        # Skip rows that match exclude phrases
        if any(phrase in full_text_lower for phrase in EXCLUDE_PHRASES):
            continue

        # Skip very short rows (likely noise)
        if len(full_text.strip()) < 3:
            continue

        # No numbers found = not a valid line item
        number_matches = list(_NUMBER_RE.finditer(full_text))
        if not number_matches:
            continue

        # The last number on the row is treated as the price.
        price_match = number_matches[-1]
        try:
            price = float(price_match.group().replace(',', ''))
        except ValueError:
            continue

        if price <= 0:
            continue

        # Cut out only the matched price span, so identical digits elsewhere
        # in the row (quantities, sizes, codes) stay in the description.
        description = (
            full_text[:price_match.start()] + full_text[price_match.end():]
        ).strip()

        # Skip if description is now empty or too short
        if len(description) < 2:
            continue

        structured_items.append({
            "description": description,
            "quantity": 1,
            "unit_price": price,
            "total": price,
        })

    return structured_items