"""recipe_nlp/extractor.py — ingredient extraction and normalization.""" from __future__ import annotations import re, json from dataclasses import dataclass, field from typing import List, Dict, Any from recipe_nlp.parser import RecipeParser, RawIngredientMention from utils.config import config, NLPConfig from utils.logger import logger FRACTION_MAP = {"½":"0.5","⅓":"0.333","⅔":"0.667","¼":"0.25","¾":"0.75","⅛":"0.125","⅜":"0.375"} INGREDIENT_BLACKLIST = { "recipe","dish","meal","food","step","minute","minutes","hour","hours", "degree","degrees","temperature","heat","pan","pot","oven","skillet", "bowl","plate","cup","spoon","knife","board","cutting", } HIGH_RISK = { "butter","lard","shortening","margarine","cream cheese","heavy cream", "double cream","bacon","sausage","white sugar","corn syrup","mayonnaise", } HEALTHY_MARKERS = { "spinach","kale","broccoli","cauliflower","carrot","celery","apple","banana", "berry","blueberry","strawberry","salmon","tuna","quinoa","oat","lentil", "chickpea","bean","almond","walnut","avocado","olive oil", } @dataclass class Ingredient: name: str; quantity: str = ""; unit: str = "" method: str = ""; is_high_risk: bool = False; is_healthy: bool = False def to_dict(self) -> Dict[str, Any]: return {"name": self.name, "quantity": self.quantity, "unit": self.unit, "method": self.method} @dataclass class RecipeStructure: ingredients: List[Ingredient] = field(default_factory=list) cooking_methods: List[str] = field(default_factory=list) servings_hint: int = 4 raw_text: str = "" def to_dict(self) -> Dict[str, Any]: return {"ingredients":[i.to_dict() for i in self.ingredients], "cooking_methods":self.cooking_methods,"servings_hint":self.servings_hint} def to_json(self, indent:int=2) -> str: return json.dumps(self.to_dict(), indent=indent) class RecipeExtractor: def __init__(self, cfg: NLPConfig = None): self.cfg = cfg or config.nlp self.parser = RecipeParser(cfg) def extract(self, recipe_text: str) -> RecipeStructure: text = self._preprocess(recipe_text) mentions = self.parser.extract_raw_mentions(text) ings = self._normalize_mentions(mentions) ings = self._deduplicate(ings) ings = self._annotate_health_flags(ings) return RecipeStructure( ingredients=ings, cooking_methods=self._extract_all_methods(text), servings_hint=self._extract_servings(text), raw_text=text, ) def _preprocess(self, text: str) -> str: # Fix spoken fractions like "1-1-slash-3" → "1.333" and "1-slash-2" → "0.5" import re # "1-1-slash-3" or "1-1/3" → mixed number text = re.sub( r'(\d+)[\s\-]+(\d+)[\s\-]*slash[\s\-]*(\d+)', lambda m: str(round(int(m.group(1)) + int(m.group(2)) / int(m.group(3)), 3)), text, flags=re.IGNORECASE ) # "1-slash-2" or "1/2" spoken → fraction text = re.sub( r'(\d+)[\s\-]*slash[\s\-]*(\d+)', lambda m: str(round(int(m.group(1)) / int(m.group(2)), 3)), text, flags=re.IGNORECASE ) # "3-8-ounce" → "3 8 ounce" (quantity-size-unit patterns) text = re.sub(r'(\d+)-(\d+)-(ounce|gram|pound|oz|g|lb)', r'\1 \2 \3', text, flags=re.IGNORECASE) for ch, val in FRACTION_MAP.items(): text = text.replace(ch, val) text = re.sub(r"\s+", " ", text).strip() text = re.sub(r"\btbsp\b", "tablespoon", text, flags=re.IGNORECASE) text = re.sub(r"\btbs\b", "tablespoon", text, flags=re.IGNORECASE) text = re.sub(r"\btsp\b", "teaspoon", text, flags=re.IGNORECASE) text = re.sub(r"\boz\b", "ounce", text, flags=re.IGNORECASE) text = re.sub(r"\blbs?\b", "pound", text, flags=re.IGNORECASE) return text def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]: result = [] for m in mentions: name = m.food_token.lower().strip() if name in INGREDIENT_BLACKLIST or len(name) <= 2: continue qty = " ".join(filter(None, [m.quantity_str, m.unit_str])) result.append(Ingredient(name=name, quantity=qty, unit=m.unit_str, method=m.method_str)) return result def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]: seen: Dict[str, Ingredient] = {} for ing in ings: if ing.name in seen: if not seen[ing.name].quantity and ing.quantity: seen[ing.name] = ing elif not seen[ing.name].method and ing.method: seen[ing.name].method = ing.method else: seen[ing.name] = ing return list(seen.values()) def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]: for ing in ings: n = ing.name.lower() ing.is_high_risk = any(h in n for h in HIGH_RISK) ing.is_healthy = any(h in n for h in HEALTHY_MARKERS) return ings def _extract_all_methods(self, text: str) -> List[str]: tl = text.lower() return list({m for m in self.cfg.cooking_methods if m.lower() in tl}) def _extract_servings(self, text: str) -> int: for p in [r"serves?\s+(\d+)", r"(\d+)\s+servings?", r"makes?\s+(\d+)", r"for\s+(\d+)\s+people"]: m = re.search(p, text.lower()) if m: return int(m.group(1)) return config.default_servings