File size: 5,669 Bytes
f75c5b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""recipe_nlp/extractor.py — ingredient extraction and normalization."""
from __future__ import annotations
import re, json
from dataclasses import dataclass, field
from typing import List, Dict, Any
from recipe_nlp.parser import RecipeParser, RawIngredientMention
from utils.config import config, NLPConfig
from utils.logger import logger

# Unicode vulgar-fraction glyphs mapped to their decimal value as a string
# (rounded to three decimal places). Used by RecipeExtractor._preprocess to
# turn glyphs into plain numbers before parsing.
FRACTION_MAP = {
    "½": "0.5",
    "⅓": "0.333",
    "⅔": "0.667",
    "¼": "0.25",
    "¾": "0.75",
    "⅛": "0.125",
    "⅜": "0.375",
    # The remaining eighths, plus fifths and sixths, so every common
    # single-glyph fraction is converted rather than left in the text.
    "⅝": "0.625",
    "⅞": "0.875",
    "⅕": "0.2",
    "⅖": "0.4",
    "⅗": "0.6",
    "⅘": "0.8",
    "⅙": "0.167",
    "⅚": "0.833",
}

# Tokens that are never kept as ingredient names (generic recipe words,
# time/temperature words and kitchen equipment). Compared against the
# lower-cased, stripped food token by exact match.
INGREDIENT_BLACKLIST = {
    "recipe", "dish", "meal", "food", "step", "minute", "minutes", "hour", "hours",
    "degree", "degrees", "temperature", "heat", "pan", "pot", "oven", "skillet",
    "bowl", "plate", "cup", "spoon", "knife", "board", "cutting",
}

# An ingredient whose name contains any of these substrings is flagged
# is_high_risk.
HIGH_RISK = {
    "butter", "lard", "shortening", "margarine", "cream cheese", "heavy cream",
    "double cream", "bacon", "sausage", "white sugar", "corn syrup", "mayonnaise",
}

# An ingredient whose name contains any of these substrings is flagged
# is_healthy.
HEALTHY_MARKERS = {
    "spinach", "kale", "broccoli", "cauliflower", "carrot", "celery", "apple", "banana",
    "berry", "blueberry", "strawberry", "salmon", "tuna", "quinoa", "oat", "lentil",
    "chickpea", "bean", "almond", "walnut", "avocado", "olive oil",
}

@dataclass
class Ingredient:
    """One normalized ingredient produced by the extractor.

    ``is_high_risk`` and ``is_healthy`` are annotations set after
    construction; they are not part of the serialized form.
    """

    name: str
    quantity: str = ""
    unit: str = ""
    method: str = ""
    is_high_risk: bool = False
    is_healthy: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the name, quantity, unit and method as a plain dict."""
        exported = ("name", "quantity", "unit", "method")
        return {key: getattr(self, key) for key in exported}

@dataclass
class RecipeStructure:
    """Structured result of extracting a recipe.

    ``raw_text`` is carried on the object but is not included in the
    dict/JSON output.
    """

    ingredients: List[Ingredient] = field(default_factory=list)
    cooking_methods: List[str] = field(default_factory=list)
    servings_hint: int = 4
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return ingredients (as dicts), cooking methods and servings hint."""
        serialized = []
        for item in self.ingredients:
            serialized.append(item.to_dict())
        payload: Dict[str, Any] = {}
        payload["ingredients"] = serialized
        payload["cooking_methods"] = self.cooking_methods
        payload["servings_hint"] = self.servings_hint
        return payload

    def to_json(self, indent: int = 2) -> str:
        """Serialize ``to_dict()`` to a JSON string with the given indent."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent)


class RecipeExtractor:
    def __init__(self, cfg: NLPConfig | None = None):
        """Create an extractor and the parser it delegates to.

        Args:
            cfg: NLP configuration to use. When omitted (or falsy) the global
                ``config.nlp`` is used instead.
        """
        self.cfg = cfg or config.nlp
        # Hand the parser the resolved configuration rather than the raw
        # argument, so extractor and parser always share the same settings
        # and the parser never receives None.
        self.parser = RecipeParser(self.cfg)

    def extract(self, recipe_text: str) -> RecipeStructure:
        """Run the whole extraction pipeline over ``recipe_text``.

        The text is preprocessed once; ingredient mentions, cooking methods
        and the servings hint are all derived from that preprocessed text,
        which is also what ends up in ``raw_text``.
        """
        cleaned = self._preprocess(recipe_text)
        raw_mentions = self.parser.extract_raw_mentions(cleaned)

        # normalize -> de-duplicate -> flag, each step feeding the next
        ingredients = self._annotate_health_flags(
            self._deduplicate(self._normalize_mentions(raw_mentions))
        )

        methods = self._extract_all_methods(cleaned)
        servings = self._extract_servings(cleaned)
        return RecipeStructure(
            ingredients=ingredients,
            cooking_methods=methods,
            servings_hint=servings,
            raw_text=cleaned,
        )

    def _preprocess(self, text: str) -> str:
    # Fix spoken fractions like "1-1-slash-3" → "1.333" and "1-slash-2" → "0.5"
        import re
    
    # "1-1-slash-3" or "1-1/3" → mixed number
        text = re.sub(
        r'(\d+)[\s\-]+(\d+)[\s\-]*slash[\s\-]*(\d+)',
        lambda m: str(round(int(m.group(1)) + int(m.group(2)) / int(m.group(3)), 3)),
        text, flags=re.IGNORECASE
    )
    # "1-slash-2" or "1/2" spoken → fraction
        text = re.sub(
        r'(\d+)[\s\-]*slash[\s\-]*(\d+)',
        lambda m: str(round(int(m.group(1)) / int(m.group(2)), 3)),
        text, flags=re.IGNORECASE
    )
    # "3-8-ounce" → "3 8 ounce" (quantity-size-unit patterns)
        text = re.sub(r'(\d+)-(\d+)-(ounce|gram|pound|oz|g|lb)', 
                  r'\1 \2 \3', text, flags=re.IGNORECASE)
        for ch, val in FRACTION_MAP.items():
            text = text.replace(ch, val)
        text = re.sub(r"\s+", " ", text).strip()
        text = re.sub(r"\btbsp\b", "tablespoon", text, flags=re.IGNORECASE)
        text = re.sub(r"\btbs\b",  "tablespoon", text, flags=re.IGNORECASE)
        text = re.sub(r"\btsp\b",  "teaspoon",   text, flags=re.IGNORECASE)
        text = re.sub(r"\boz\b",   "ounce",      text, flags=re.IGNORECASE)
        text = re.sub(r"\blbs?\b", "pound",      text, flags=re.IGNORECASE)
        return text

    def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]:
        """Turn raw parser mentions into ``Ingredient`` objects.

        The food token is lower-cased and stripped to form the name.
        Mentions whose name is blacklisted or at most two characters long
        are dropped. ``quantity`` is the mention's quantity and unit strings
        joined by a space (empty parts skipped); ``unit`` and ``method`` are
        passed through as given.
        """
        ingredients: List[Ingredient] = []
        for mention in mentions:
            token = mention.food_token.lower().strip()
            if len(token) <= 2 or token in INGREDIENT_BLACKLIST:
                continue
            parts = [part for part in (mention.quantity_str, mention.unit_str) if part]
            ingredients.append(
                Ingredient(
                    name=token,
                    quantity=" ".join(parts),
                    unit=mention.unit_str,
                    method=mention.method_str,
                )
            )
        return ingredients

    def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]:
        seen: Dict[str, Ingredient] = {}
        for ing in ings:
            if ing.name in seen:
                if not seen[ing.name].quantity and ing.quantity:
                    seen[ing.name] = ing
                elif not seen[ing.name].method and ing.method:
                    seen[ing.name].method = ing.method
            else:
                seen[ing.name] = ing
        return list(seen.values())

    def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]:
        for ing in ings:
            n = ing.name.lower()
            ing.is_high_risk = any(h in n for h in HIGH_RISK)
            ing.is_healthy   = any(h in n for h in HEALTHY_MARKERS)
        return ings

    def _extract_all_methods(self, text: str) -> List[str]:
        tl = text.lower()
        return list({m for m in self.cfg.cooking_methods if m.lower() in tl})

    def _extract_servings(self, text: str) -> int:
        for p in [r"serves?\s+(\d+)", r"(\d+)\s+servings?", r"makes?\s+(\d+)", r"for\s+(\d+)\s+people"]:
            m = re.search(p, text.lower())
            if m:
                return int(m.group(1))
        return config.default_servings