# File size: 5,669 Bytes (revision f75c5b2)
"""recipe_nlp/extractor.py — ingredient extraction and normalization."""
from __future__ import annotations
import re, json
from dataclasses import dataclass, field
from typing import List, Dict, Any
from recipe_nlp.parser import RecipeParser, RawIngredientMention
from utils.config import config, NLPConfig
from utils.logger import logger
# Unicode vulgar-fraction glyphs and the decimal text that replaces them
# during preprocessing (values are strings, rounded to three decimals).
FRACTION_MAP = {
    "½": "0.5",
    "⅓": "0.333",
    "⅔": "0.667",
    "¼": "0.25",
    "¾": "0.75",
    "⅛": "0.125",
    "⅜": "0.375",
}

# Tokens that must never be reported as ingredients: generic recipe words,
# time/temperature words and kitchen equipment.
INGREDIENT_BLACKLIST = {
    # generic recipe vocabulary
    "recipe",
    "dish",
    "meal",
    "food",
    "step",
    # time
    "minute",
    "minutes",
    "hour",
    "hours",
    # temperature and cookware
    "degree",
    "degrees",
    "temperature",
    "heat",
    "pan",
    "pot",
    "oven",
    "skillet",
    # utensils and serving ware
    "bowl",
    "plate",
    "cup",
    "spoon",
    "knife",
    "board",
    "cutting",
}

# Ingredient names (or name fragments) that mark an ingredient as high risk.
HIGH_RISK = {
    "butter",
    "lard",
    "shortening",
    "margarine",
    "cream cheese",
    "heavy cream",
    "double cream",
    "bacon",
    "sausage",
    "white sugar",
    "corn syrup",
    "mayonnaise",
}

# Ingredient names (or name fragments) that mark an ingredient as healthy.
HEALTHY_MARKERS = {
    "spinach",
    "kale",
    "broccoli",
    "cauliflower",
    "carrot",
    "celery",
    "apple",
    "banana",
    "berry",
    "blueberry",
    "strawberry",
    "salmon",
    "tuna",
    "quinoa",
    "oat",
    "lentil",
    "chickpea",
    "bean",
    "almond",
    "walnut",
    "avocado",
    "olive oil",
}
@dataclass
class Ingredient:
    """A single normalized ingredient extracted from a recipe.

    ``is_high_risk`` and ``is_healthy`` are annotation flags; they are kept
    on the instance but are not part of the serialized form.
    """

    name: str
    quantity: str = ""
    unit: str = ""
    method: str = ""
    is_high_risk: bool = False
    is_healthy: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the serializable fields (name, quantity, unit, method)."""
        exported = ("name", "quantity", "unit", "method")
        return {key: getattr(self, key) for key in exported}
@dataclass
class RecipeStructure:
    """Structured view of one recipe: ingredients, methods and servings.

    ``raw_text`` is stored on the instance but deliberately left out of the
    dictionary/JSON representation.
    """

    ingredients: List[Ingredient] = field(default_factory=list)
    cooking_methods: List[str] = field(default_factory=list)
    servings_hint: int = 4
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form with each ingredient serialized in turn."""
        serialized_ingredients = [item.to_dict() for item in self.ingredients]
        payload: Dict[str, Any] = {}
        payload["ingredients"] = serialized_ingredients
        payload["cooking_methods"] = self.cooking_methods
        payload["servings_hint"] = self.servings_hint
        return payload

    def to_json(self, indent: int = 2) -> str:
        """Return ``to_dict()`` encoded as JSON with the given indentation."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent)
class RecipeExtractor:
    """Extract a :class:`RecipeStructure` from free-form recipe text.

    The pipeline is: preprocess the text, ask the parser for raw ingredient
    mentions, normalize and de-duplicate them, annotate health flags, and
    finally collect cooking methods and a servings hint from the text.
    """

    # Spoken mixed number, e.g. "1-1-slash-3" or "1 1 slash 3".
    _SPOKEN_MIXED_RE = re.compile(
        r'(\d+)[\s\-]+(\d+)[\s\-]*slash[\s\-]*(\d+)', re.IGNORECASE
    )
    # Spoken simple fraction, e.g. "1-slash-2" or "1 slash 2".
    _SPOKEN_SIMPLE_RE = re.compile(r'(\d+)[\s\-]*slash[\s\-]*(\d+)', re.IGNORECASE)
    # Count-size-unit runs, e.g. "3-8-ounce" (three eight-ounce items).
    _COUNT_SIZE_UNIT_RE = re.compile(
        r'(\d+)-(\d+)-(ounce|gram|pound|oz|g|lb)', re.IGNORECASE
    )
    _WHITESPACE_RE = re.compile(r"\s+")
    # Unit abbreviations and the singular long form that replaces them.
    _UNIT_ABBREVIATIONS = (
        (re.compile(r"\btbsp\b", re.IGNORECASE), "tablespoon"),
        (re.compile(r"\btbs\b", re.IGNORECASE), "tablespoon"),
        (re.compile(r"\btsp\b", re.IGNORECASE), "teaspoon"),
        (re.compile(r"\boz\b", re.IGNORECASE), "ounce"),
        (re.compile(r"\blbs?\b", re.IGNORECASE), "pound"),
    )
    # Tried in this order against the lower-cased text; first hit wins.
    _SERVING_PATTERNS = (
        re.compile(r"serves?\s+(\d+)"),
        re.compile(r"(\d+)\s+servings?"),
        re.compile(r"makes?\s+(\d+)"),
        re.compile(r"for\s+(\d+)\s+people"),
    )

    def __init__(self, cfg: NLPConfig | None = None):
        """Create an extractor.

        Args:
            cfg: NLP configuration; falls back to ``config.nlp`` when omitted.
        """
        self.cfg = cfg or config.nlp
        # Hand the parser the resolved config so both objects share one.
        self.parser = RecipeParser(self.cfg)

    def extract(self, recipe_text: str) -> RecipeStructure:
        """Run the full extraction pipeline on ``recipe_text``.

        Returns:
            A ``RecipeStructure`` whose ``raw_text`` is the *preprocessed*
            text, not the input as given.
        """
        text = self._preprocess(recipe_text)
        mentions = self.parser.extract_raw_mentions(text)
        ings = self._normalize_mentions(mentions)
        ings = self._deduplicate(ings)
        ings = self._annotate_health_flags(ings)
        return RecipeStructure(
            ingredients=ings,
            cooking_methods=self._extract_all_methods(text),
            servings_hint=self._extract_servings(text),
            raw_text=text,
        )

    def _preprocess(self, text: str) -> str:
        """Normalize quantities, whitespace and unit abbreviations.

        Steps, in order:
          1. spoken mixed numbers ("1-1-slash-3") become decimals ("1.333");
          2. spoken fractions ("1-slash-2") become decimals ("0.5");
          3. "3-8-ounce" style runs are split into "3 8 ounce";
          4. unicode fraction glyphs become decimals, merged with a directly
             preceding whole number ("1½" -> "1.5");
          5. whitespace runs collapse to one space and the ends are stripped;
          6. unit abbreviations (tbsp, tbs, tsp, oz, lb/lbs) are expanded.

        Only the literal word "slash" is recognised in steps 1-2; a written
        "/" is left untouched. Empty or ``None`` input yields ``""``.
        """
        if not text:
            return ""
        text = self._SPOKEN_MIXED_RE.sub(self._spoken_mixed_to_decimal, text)
        text = self._SPOKEN_SIMPLE_RE.sub(self._spoken_simple_to_decimal, text)
        text = self._COUNT_SIZE_UNIT_RE.sub(r'\1 \2 \3', text)
        text = self._expand_unicode_fractions(text)
        text = self._WHITESPACE_RE.sub(" ", text).strip()
        for pattern, long_form in self._UNIT_ABBREVIATIONS:
            text = pattern.sub(long_form, text)
        return text

    @staticmethod
    def _spoken_mixed_to_decimal(match: re.Match) -> str:
        """Convert a whole/numerator/denominator match to decimal text."""
        denominator = int(match.group(3))
        if denominator == 0:
            # Not a real fraction; leave the text alone instead of crashing.
            return match.group(0)
        value = int(match.group(1)) + int(match.group(2)) / denominator
        return str(round(value, 3))

    @staticmethod
    def _spoken_simple_to_decimal(match: re.Match) -> str:
        """Convert a numerator/denominator match to decimal text."""
        denominator = int(match.group(2))
        if denominator == 0:
            # Not a real fraction; leave the text alone instead of crashing.
            return match.group(0)
        return str(round(int(match.group(1)) / denominator, 3))

    @staticmethod
    def _expand_unicode_fractions(text: str) -> str:
        """Replace the glyphs listed in ``FRACTION_MAP`` with decimal text.

        A whole number directly before a glyph (optionally separated by one
        space or tab) is added to it, so "1½" becomes "1.5" rather than the
        misleading "10.5". Newlines are not bridged, so a number ending one
        line is never merged with a fraction starting the next.
        """
        if FRACTION_MAP:
            glyphs = "".join(re.escape(glyph) for glyph in FRACTION_MAP)
            text = re.sub(
                rf"(\d+)[ \t]?([{glyphs}])",
                lambda m: str(
                    round(int(m.group(1)) + float(FRACTION_MAP[m.group(2)]), 3)
                ),
                text,
            )
        # Remaining glyphs stand alone and map straight to their decimal.
        for glyph, decimal in FRACTION_MAP.items():
            text = text.replace(glyph, decimal)
        return text

    def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]:
        """Turn raw parser mentions into ``Ingredient`` objects.

        Mentions whose lower-cased, stripped food token is blacklisted or has
        two characters or fewer are dropped. ``quantity`` is the quantity and
        unit strings joined by a space (empty parts skipped).
        """
        result = []
        for m in mentions:
            name = m.food_token.lower().strip()
            if name in INGREDIENT_BLACKLIST or len(name) <= 2:
                continue
            qty = " ".join(filter(None, [m.quantity_str, m.unit_str]))
            result.append(
                Ingredient(name=name, quantity=qty, unit=m.unit_str, method=m.method_str)
            )
        return result

    def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Merge ingredients that share a name, keeping first-seen order.

        For a repeated name, a mention with a quantity replaces a kept entry
        that has none (inheriting the kept entry's method if it has none of
        its own); otherwise a missing method on the kept entry is filled in.
        Entries are updated in place.
        """
        merged: Dict[str, Ingredient] = {}
        for ing in ings:
            kept = merged.get(ing.name)
            if kept is None:
                merged[ing.name] = ing
                continue
            if not kept.quantity and ing.quantity:
                # Do not lose a method learned from the earlier mention.
                if not ing.method and kept.method:
                    ing.method = kept.method
                merged[ing.name] = ing
            elif not kept.method and ing.method:
                kept.method = ing.method
        return list(merged.values())

    def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Set ``is_high_risk`` / ``is_healthy`` on each ingredient in place.

        NOTE: matching is by substring of the lower-cased name, so a marker
        also matches inside longer words (e.g. "oat" inside "goat").
        """
        for ing in ings:
            n = ing.name.lower()
            ing.is_high_risk = any(h in n for h in HIGH_RISK)
            ing.is_healthy = any(h in n for h in HEALTHY_MARKERS)
        return ings

    def _extract_all_methods(self, text: str) -> List[str]:
        """Return the configured cooking methods that occur in ``text``.

        Matching is a case-insensitive substring test. The result is unique
        and follows the order of ``self.cfg.cooking_methods`` so repeated
        runs give the same list.
        """
        lowered = text.lower()
        hits = (m for m in self.cfg.cooking_methods if m.lower() in lowered)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(hits))

    def _extract_servings(self, text: str) -> int:
        """Return the servings count stated in ``text``.

        Patterns are tried in a fixed order ("serves N", "N servings",
        "makes N", "for N people"); when none match,
        ``config.default_servings`` is returned.
        """
        lowered = text.lower()
        for pattern in self._SERVING_PATTERNS:
            found = pattern.search(lowered)
            if found:
                return int(found.group(1))
        return config.default_servings
|