| """recipe_nlp/extractor.py — ingredient extraction and normalization.""" |
| from __future__ import annotations |
| import re, json |
| from dataclasses import dataclass, field |
| from typing import List, Dict, Any |
| from recipe_nlp.parser import RecipeParser, RawIngredientMention |
| from utils.config import config, NLPConfig |
| from utils.logger import logger |
|
|
# Unicode vulgar-fraction glyphs mapped to the decimal strings substituted for
# them during text preprocessing. Thirds and sixths are rounded to three
# decimal places. Values are strings because they are spliced back into text.
FRACTION_MAP = {
    "½": "0.5",
    "⅓": "0.333",
    "⅔": "0.667",
    "¼": "0.25",
    "¾": "0.75",
    "⅕": "0.2",
    "⅖": "0.4",
    "⅗": "0.6",
    "⅘": "0.8",
    "⅙": "0.167",
    "⅚": "0.833",
    "⅛": "0.125",
    "⅜": "0.375",
    "⅝": "0.625",
    "⅞": "0.875",
}

# Tokens that are never accepted as ingredient names: generic recipe words,
# time/temperature words, and cookware/utensils.
INGREDIENT_BLACKLIST = {
    "recipe", "dish", "meal", "food", "step", "minute", "minutes", "hour", "hours",
    "degree", "degrees", "temperature", "heat", "pan", "pot", "oven", "skillet",
    "bowl", "plate", "cup", "spoon", "knife", "board", "cutting",
}

# Markers used to set ``Ingredient.is_high_risk``.
HIGH_RISK = {
    "butter", "lard", "shortening", "margarine", "cream cheese", "heavy cream",
    "double cream", "bacon", "sausage", "white sugar", "corn syrup", "mayonnaise",
}

# Markers used to set ``Ingredient.is_healthy``.
HEALTHY_MARKERS = {
    "spinach", "kale", "broccoli", "cauliflower", "carrot", "celery", "apple", "banana",
    "berry", "blueberry", "strawberry", "salmon", "tuna", "quinoa", "oat", "lentil",
    "chickpea", "bean", "almond", "walnut", "avocado", "olive oil",
}
|
|
@dataclass
class Ingredient:
    """A single normalized ingredient entry.

    Only ``name`` is required; every other field defaults to an empty string
    or ``False``.
    """

    name: str
    quantity: str = ""
    unit: str = ""
    method: str = ""
    # Health flags are not part of the ``to_dict`` export.
    is_high_risk: bool = False
    is_healthy: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the name, quantity, unit and method as a plain dict."""
        exported = ("name", "quantity", "unit", "method")
        return {key: getattr(self, key) for key in exported}
|
|
@dataclass
class RecipeStructure:
    """Structured result of an extraction: ingredients, methods and servings.

    ``raw_text`` is stored on the instance but is not included in the
    ``to_dict`` / ``to_json`` output.
    """

    ingredients: List[Ingredient] = field(default_factory=list)
    cooking_methods: List[str] = field(default_factory=list)
    servings_hint: int = 4
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a dict with each ingredient exported via its ``to_dict``."""
        payload: Dict[str, Any] = {}
        payload["ingredients"] = [item.to_dict() for item in self.ingredients]
        payload["cooking_methods"] = self.cooking_methods
        payload["servings_hint"] = self.servings_hint
        return payload

    def to_json(self, indent: int = 2) -> str:
        """Serialize ``to_dict()`` to a JSON string using ``indent``."""
        as_dict = self.to_dict()
        return json.dumps(as_dict, indent=indent)
|
|
|
|
class RecipeExtractor:
    """Extract a :class:`RecipeStructure` from free-form recipe text.

    The pipeline is: preprocess the text, ask the parser for raw ingredient
    mentions, normalize and de-duplicate them, set health flags, then collect
    cooking methods and a servings hint from the preprocessed text.
    """

    # Spoken mixed number, e.g. "1 1 slash 2" or "1-1-slash-2".
    _MIXED_SLASH_RE = re.compile(
        r'(\d+)[\s\-]+(\d+)[\s\-]*slash[\s\-]*(\d+)', re.IGNORECASE
    )
    # Spoken simple fraction, e.g. "3 slash 4".
    _SIMPLE_SLASH_RE = re.compile(r'(\d+)[\s\-]*slash[\s\-]*(\d+)', re.IGNORECASE)
    # Hyphenated "count-size-unit", e.g. "2-14-ounce".
    _COUNT_SIZE_UNIT_RE = re.compile(
        r'(\d+)-(\d+)-(ounce|gram|pound|oz|g|lb)', re.IGNORECASE
    )
    # Unit abbreviations and their spelled-out replacement, applied in order.
    _UNIT_ABBREVIATIONS = (
        (re.compile(r"\btbsp\b", re.IGNORECASE), "tablespoon"),
        (re.compile(r"\btbs\b", re.IGNORECASE), "tablespoon"),
        (re.compile(r"\btsp\b", re.IGNORECASE), "teaspoon"),
        (re.compile(r"\boz\b", re.IGNORECASE), "ounce"),
        (re.compile(r"\blbs?\b", re.IGNORECASE), "pound"),
    )
    # Servings phrasings, tried in this order. Leading word boundaries stop
    # matches inside longer words such as "preserves 3" or "reserves 2".
    _SERVINGS_PATTERNS = (
        re.compile(r"\bserves?\s+(\d+)", re.IGNORECASE),
        re.compile(r"(\d+)\s+servings?", re.IGNORECASE),
        re.compile(r"\bmakes?\s+(\d+)", re.IGNORECASE),
        re.compile(r"\bfor\s+(\d+)\s+people", re.IGNORECASE),
    )

    def __init__(self, cfg: NLPConfig | None = None):
        """Create an extractor.

        Args:
            cfg: NLP configuration; when omitted (or falsy) ``config.nlp`` is
                used.
        """
        self.cfg = cfg or config.nlp
        # Hand the parser the resolved config so extractor and parser always
        # work from the same settings, even when ``cfg`` was not supplied.
        self.parser = RecipeParser(self.cfg)

    def extract(self, recipe_text: str) -> RecipeStructure:
        """Run the full extraction pipeline on ``recipe_text``.

        Returns:
            A ``RecipeStructure`` whose ``raw_text`` is the *preprocessed*
            text, not the original input.
        """
        text = self._preprocess(recipe_text)
        mentions = self.parser.extract_raw_mentions(text)
        ings = self._normalize_mentions(mentions)
        ings = self._deduplicate(ings)
        ings = self._annotate_health_flags(ings)
        return RecipeStructure(
            ingredients=ings,
            cooking_methods=self._extract_all_methods(text),
            servings_hint=self._extract_servings(text),
            raw_text=text,
        )

    def _preprocess(self, text: str) -> str:
        """Normalize fractions, whitespace and unit abbreviations.

        Steps, in order:
          1. spoken mixed numbers ("1 1 slash 2" -> "1.5");
          2. spoken simple fractions ("3 slash 4" -> "0.75");
          3. hyphenated count-size-unit ("2-14-ounce" -> "2 14 ounce");
          4. Unicode fraction glyphs from ``FRACTION_MAP`` ("1½" -> "1.5",
             "½" -> "0.5");
          5. whitespace runs collapsed to one space and the ends stripped;
          6. tbsp/tbs/tsp/oz/lb(s) spelled out.

        A spoken fraction with a zero denominator is left untouched rather
        than raising ``ZeroDivisionError``.
        """

        def mixed_to_decimal(m: re.Match) -> str:
            whole, num, den = (int(g) for g in m.groups())
            if den == 0:
                return m.group(0)
            return str(round(whole + num / den, 3))

        def simple_to_decimal(m: re.Match) -> str:
            num, den = int(m.group(1)), int(m.group(2))
            if den == 0:
                return m.group(0)
            return str(round(num / den, 3))

        text = self._MIXED_SLASH_RE.sub(mixed_to_decimal, text)
        text = self._SIMPLE_SLASH_RE.sub(simple_to_decimal, text)
        text = self._COUNT_SIZE_UNIT_RE.sub(r'\1 \2 \3', text)

        if FRACTION_MAP:
            # A whole number followed by a glyph is a mixed number. Plain
            # replacement would turn "1½" into "10.5", so add the parts.
            # The lookbehind keeps us from grabbing the tail of a decimal.
            glyphs = "|".join(
                re.escape(g) for g in sorted(FRACTION_MAP, key=len, reverse=True)
            )
            text = re.sub(
                rf"(?<![\d.])(\d+)[\s\-]*({glyphs})",
                lambda m: str(
                    round(int(m.group(1)) + float(FRACTION_MAP[m.group(2)]), 3)
                ),
                text,
            )
        # Remaining glyphs stand alone.
        for ch, val in FRACTION_MAP.items():
            text = text.replace(ch, val)

        text = re.sub(r"\s+", " ", text).strip()
        for pattern, spelled_out in self._UNIT_ABBREVIATIONS:
            text = pattern.sub(spelled_out, text)
        return text

    def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]:
        """Convert raw parser mentions into ``Ingredient`` objects.

        Names are lower-cased and stripped. Mentions whose name is missing,
        in ``INGREDIENT_BLACKLIST``, or two characters or shorter are dropped.
        ``quantity`` is the mention's quantity and unit strings joined by a
        space (empty parts omitted); ``unit`` is also stored separately.
        """
        result = []
        for m in mentions:
            # ``or ""`` lets a missing token fall into the length filter
            # below instead of raising AttributeError.
            name = (m.food_token or "").lower().strip()
            if name in INGREDIENT_BLACKLIST or len(name) <= 2:
                continue
            qty = " ".join(filter(None, [m.quantity_str, m.unit_str]))
            result.append(
                Ingredient(name=name, quantity=qty, unit=m.unit_str, method=m.method_str)
            )
        return result

    def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Merge ingredients that share a name, keeping first-seen order.

        For a repeated name:
          * if the kept entry has no quantity and the new one does, the new
            entry replaces it, inheriting the kept entry's method when the
            new entry has none (so the method is not lost);
          * otherwise, if the kept entry has no method and the new one does,
            the method is copied onto the kept entry.

        Note: merging mutates the ``Ingredient`` objects passed in.
        """
        seen: Dict[str, Ingredient] = {}
        for ing in ings:
            kept = seen.get(ing.name)
            if kept is None:
                seen[ing.name] = ing
                continue
            if not kept.quantity and ing.quantity:
                if not ing.method and kept.method:
                    ing.method = kept.method
                seen[ing.name] = ing
            elif not kept.method and ing.method:
                kept.method = ing.method
        return list(seen.values())

    def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Set ``is_high_risk`` / ``is_healthy`` on each ingredient in place.

        A flag is set when any marker from ``HIGH_RISK`` / ``HEALTHY_MARKERS``
        occurs as a substring of the lower-cased name. Both flags may be set.

        NOTE(review): substring matching also fires inside longer words
        (e.g. the marker "oat" occurs in "goat"); kept as-is because suffix
        matches such as "berry" in "raspberry" rely on it.
        """
        for ing in ings:
            n = ing.name.lower()
            ing.is_high_risk = any(h in n for h in HIGH_RISK)
            ing.is_healthy = any(h in n for h in HEALTHY_MARKERS)
        return ings

    def _extract_all_methods(self, text: str) -> List[str]:
        """Return the configured cooking methods that occur in ``text``.

        Matching is a case-insensitive substring test. The result has no
        duplicates and follows the order of ``self.cfg.cooking_methods``, so
        the output is the same from run to run.
        """
        tl = text.lower()
        found = (m for m in self.cfg.cooking_methods if m.lower() in tl)
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(found))

    def _extract_servings(self, text: str) -> int:
        """Return a servings count parsed from ``text``.

        Patterns are tried in priority order ("serves N", "N servings",
        "makes N", "for N people"); the first positive count wins. When no
        pattern yields a positive number, ``config.default_servings`` is
        returned.
        """
        for pattern in self._SERVINGS_PATTERNS:
            m = pattern.search(text)
            if m:
                count = int(m.group(1))
                # A count of zero is not a usable servings hint; keep looking.
                if count > 0:
                    return count
        return config.default_servings
|
|