# he99codes's picture
# Clean deployment with LFS setup correctly
# f75c5b2
"""recipe_nlp/extractor.py — ingredient extraction and normalization."""
from __future__ import annotations
import re, json
from dataclasses import dataclass, field
from typing import List, Dict, Any
from recipe_nlp.parser import RecipeParser, RawIngredientMention
from utils.config import config, NLPConfig
from utils.logger import logger
# Unicode vulgar-fraction glyphs mapped to decimal strings (three decimal places
# at most). RecipeExtractor._preprocess substitutes each glyph with its value.
# The table covers halves through tenths, including all four odd eighths, so
# that glyphs such as "⅝" or "⅞" are converted like "⅛" and "⅜" are.
FRACTION_MAP = {
    # halves, thirds, quarters
    "½": "0.5",
    "⅓": "0.333",
    "⅔": "0.667",
    "¼": "0.25",
    "¾": "0.75",
    # eighths
    "⅛": "0.125",
    "⅜": "0.375",
    "⅝": "0.625",
    "⅞": "0.875",
    # fifths
    "⅕": "0.2",
    "⅖": "0.4",
    "⅗": "0.6",
    "⅘": "0.8",
    # sixths
    "⅙": "0.167",
    "⅚": "0.833",
    # sevenths, ninths, tenths
    "⅐": "0.143",
    "⅑": "0.111",
    "⅒": "0.1",
}
# Lower-case tokens that RecipeExtractor._normalize_mentions refuses to accept
# as an ingredient name (compared against the whole name, not as substrings).
INGREDIENT_BLACKLIST = {
    # generic recipe vocabulary
    "recipe", "dish", "meal", "food", "step",
    # durations
    "minute", "minutes", "hour", "hours",
    # heat-related words
    "degree", "degrees", "temperature", "heat",
    # cookware
    "pan", "pot", "oven", "skillet",
    # tableware and utensils
    "bowl", "plate", "cup", "spoon", "knife", "board", "cutting",
}
# Name fragments that make RecipeExtractor._annotate_health_flags set
# ``is_high_risk`` on an ingredient. Matching there is by substring, so an entry
# also matches longer names that merely contain it.
HIGH_RISK = {
    # fats
    "butter", "lard", "shortening", "margarine",
    # dairy
    "cream cheese", "heavy cream", "double cream",
    # meats
    "bacon", "sausage",
    # sweeteners
    "white sugar", "corn syrup",
    # condiments
    "mayonnaise",
}
# Name fragments that make RecipeExtractor._annotate_health_flags set
# ``is_healthy`` on an ingredient. Matching there is by substring, so an entry
# also matches longer names that merely contain it.
HEALTHY_MARKERS = {
    # vegetables
    "spinach", "kale", "broccoli", "cauliflower", "carrot", "celery",
    # fruit
    "apple", "banana", "berry", "blueberry", "strawberry", "avocado",
    # fish
    "salmon", "tuna",
    # grains and legumes
    "quinoa", "oat", "lentil", "chickpea", "bean",
    # nuts and oils
    "almond", "walnut", "olive oil",
}
@dataclass
class Ingredient:
    """One ingredient record with its amount, preparation and health flags."""

    # Ingredient name; the only required field.
    name: str
    # Amount text. RecipeExtractor fills this with quantity and unit joined.
    quantity: str = ""
    # Unit text on its own.
    unit: str = ""
    # Preparation / cooking method attached to this ingredient.
    method: str = ""
    # Health flags; both default to False and are not part of ``to_dict``.
    is_high_risk: bool = False
    is_healthy: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the serializable view: name, quantity, unit and method only."""
        exported_fields = ("name", "quantity", "unit", "method")
        return {key: getattr(self, key) for key in exported_fields}
@dataclass
class RecipeStructure:
    """Container for everything extracted from one recipe text."""

    # Extracted ingredient records.
    ingredients: List[Ingredient] = field(default_factory=list)
    # Names of cooking methods found in the text.
    cooking_methods: List[str] = field(default_factory=list)
    # Number of servings; 4 unless the caller supplies another value.
    servings_hint: int = 4
    # Text the structure was built from; not included in ``to_dict``.
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return ingredients (as dicts), cooking methods and servings hint.

        ``cooking_methods`` is passed through as the same list object, not a copy.
        """
        payload: Dict[str, Any] = {}
        payload["ingredients"] = [item.to_dict() for item in self.ingredients]
        payload["cooking_methods"] = self.cooking_methods
        payload["servings_hint"] = self.servings_hint
        return payload

    def to_json(self, indent: int = 2) -> str:
        """Serialize ``to_dict()`` to a JSON string with the given indent."""
        serializable = self.to_dict()
        return json.dumps(serializable, indent=indent)
class RecipeExtractor:
    """Turn free-form recipe text into a :class:`RecipeStructure`.

    Pipeline used by :meth:`extract`: clean the text (``_preprocess``), let
    ``RecipeParser`` find raw ingredient mentions, then normalize, de-duplicate
    and health-flag them, and finally scan the text for cooking methods and a
    servings count.
    """

    # Spoken mixed number: "1-1-slash-3" or "1 1 slash 3" (the word "slash",
    # not the "/" character).
    _SPOKEN_MIXED_RE = re.compile(
        r'(\d+)[\s\-]+(\d+)[\s\-]*slash[\s\-]*(\d+)', re.IGNORECASE
    )
    # Spoken simple fraction: "1-slash-2" or "1 slash 2".
    _SPOKEN_FRACTION_RE = re.compile(
        r'(\d+)[\s\-]*slash[\s\-]*(\d+)', re.IGNORECASE
    )
    # Hyphenated quantity-size-unit run such as "3-8-ounce".
    _COUNT_SIZE_UNIT_RE = re.compile(
        r'(\d+)-(\d+)-(ounce|gram|pound|oz|g|lb)', re.IGNORECASE
    )
    # Unit abbreviations and the full word each is rewritten to, applied in order.
    _UNIT_ALIASES = (
        (re.compile(r"\btbsp\b", re.IGNORECASE), "tablespoon"),
        (re.compile(r"\btbs\b", re.IGNORECASE), "tablespoon"),
        (re.compile(r"\btsp\b", re.IGNORECASE), "teaspoon"),
        (re.compile(r"\boz\b", re.IGNORECASE), "ounce"),
        (re.compile(r"\blbs?\b", re.IGNORECASE), "pound"),
    )
    # Servings phrases, tried in order against lower-cased text. The leading
    # \b stops "reserve 1 cup" / "preserves 2" from being read as "serve(s) N".
    _SERVINGS_PATTERNS = (
        re.compile(r"\bserves?\s+(\d+)"),
        re.compile(r"(\d+)\s+servings?"),
        re.compile(r"\bmakes?\s+(\d+)"),
        re.compile(r"\bfor\s+(\d+)\s+people"),
    )

    def __init__(self, cfg: NLPConfig | None = None):
        """Create an extractor.

        Args:
            cfg: NLP settings; when omitted (or falsy) ``config.nlp`` is used.
        """
        self.cfg = cfg or config.nlp
        # Hand the parser the resolved settings so extractor and parser always
        # share one config object, even when ``cfg`` was not supplied.
        self.parser = RecipeParser(self.cfg)

    def extract(self, recipe_text: str) -> RecipeStructure:
        """Run the full pipeline on ``recipe_text``.

        Returns:
            A ``RecipeStructure`` whose ``raw_text`` holds the *preprocessed*
            text (the same text the parser and the method/servings scans saw).
        """
        text = self._preprocess(recipe_text)
        mentions = self.parser.extract_raw_mentions(text)
        ings = self._normalize_mentions(mentions)
        ings = self._deduplicate(ings)
        ings = self._annotate_health_flags(ings)
        return RecipeStructure(
            ingredients=ings,
            cooking_methods=self._extract_all_methods(text),
            servings_hint=self._extract_servings(text),
            raw_text=text,
        )

    @staticmethod
    def _spoken_mixed_to_decimal(match) -> str:
        """Convert a ``whole num slash den`` match to a decimal string."""
        whole, num, den = (int(part) for part in match.groups())
        if den == 0:
            # Not a real fraction; leave the text alone instead of crashing.
            return match.group(0)
        return str(round(whole + num / den, 3))

    @staticmethod
    def _spoken_fraction_to_decimal(match) -> str:
        """Convert a ``num slash den`` match to a decimal string."""
        num, den = int(match.group(1)), int(match.group(2))
        if den == 0:
            # Not a real fraction; leave the text alone instead of crashing.
            return match.group(0)
        return str(round(num / den, 3))

    def _preprocess(self, text: str) -> str:
        """Normalize numbers, whitespace and unit abbreviations in ``text``.

        Steps, in order:
          1. spoken fractions ("1-1-slash-3" -> "1.333", "1-slash-2" -> "0.5");
             a zero denominator is left untouched;
          2. "3-8-ounce" style runs are split into "3 8 ounce";
          3. Unicode fraction glyphs from ``FRACTION_MAP`` become decimals; a
             whole number directly before the glyph (optionally separated by
             spaces/tabs) is added to it, so "1½" -> "1.5" rather than "10.5";
          4. whitespace runs collapse to one space and the ends are stripped;
          5. tbsp/tbs/tsp/oz/lb(s) are expanded to full unit words.
        """
        # Mixed numbers first, otherwise the simple-fraction pattern would
        # consume the "num slash den" tail and strand the whole part.
        text = self._SPOKEN_MIXED_RE.sub(self._spoken_mixed_to_decimal, text)
        text = self._SPOKEN_FRACTION_RE.sub(self._spoken_fraction_to_decimal, text)

        text = self._COUNT_SIZE_UNIT_RE.sub(r'\1 \2 \3', text)

        if FRACTION_MAP:
            glyphs = "|".join(re.escape(glyph) for glyph in FRACTION_MAP)
            # Whole number + glyph is one quantity. Only spaces/tabs may sit
            # between them, so a number ending one line is not merged with a
            # fraction that starts the next line.
            text = re.sub(
                rf"(\d+)[ \t]*({glyphs})",
                lambda m: str(
                    round(int(m.group(1)) + float(FRACTION_MAP[m.group(2)]), 3)
                ),
                text,
            )
        # Remaining glyphs stand alone.
        for glyph, decimal in FRACTION_MAP.items():
            text = text.replace(glyph, decimal)

        text = re.sub(r"\s+", " ", text).strip()

        for pattern, full_name in self._UNIT_ALIASES:
            text = pattern.sub(full_name, text)
        return text

    def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]:
        """Convert parser mentions into ``Ingredient`` objects.

        The name is the lower-cased, stripped ``food_token``. Mentions whose
        name is in ``INGREDIENT_BLACKLIST`` or is two characters or shorter are
        dropped. ``quantity`` is the non-empty parts of ``quantity_str`` and
        ``unit_str`` joined by a space; ``unit`` and ``method`` are copied from
        ``unit_str`` and ``method_str`` unchanged.
        """
        result: List[Ingredient] = []
        for mention in mentions:
            name = mention.food_token.lower().strip()
            if name in INGREDIENT_BLACKLIST or len(name) <= 2:
                continue
            qty = " ".join(filter(None, [mention.quantity_str, mention.unit_str]))
            result.append(
                Ingredient(
                    name=name,
                    quantity=qty,
                    unit=mention.unit_str,
                    method=mention.method_str,
                )
            )
        return result

    def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Merge ingredients that share a name, keeping first-seen order.

        For a repeated name:
          * if the kept entry has no quantity and the new one does, the new
            entry replaces it, inheriting the kept entry's method when the new
            entry has none (so an already-known method is not lost);
          * otherwise, if the kept entry has no method and the new one does,
            only the method is copied onto the kept entry.

        Entries are updated in place.
        """
        merged: Dict[str, Ingredient] = {}
        for ing in ings:
            kept = merged.get(ing.name)
            if kept is None:
                merged[ing.name] = ing
            elif not kept.quantity and ing.quantity:
                if not ing.method:
                    ing.method = kept.method
                # Re-assigning an existing key keeps its original position.
                merged[ing.name] = ing
            elif not kept.method and ing.method:
                kept.method = ing.method
        return list(merged.values())

    def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Set ``is_high_risk`` / ``is_healthy`` on each ingredient in place.

        A flag is True when any entry of ``HIGH_RISK`` / ``HEALTHY_MARKERS``
        occurs as a substring of the lower-cased name. Returns the same list.
        """
        for ing in ings:
            lowered = ing.name.lower()
            ing.is_high_risk = any(marker in lowered for marker in HIGH_RISK)
            ing.is_healthy = any(marker in lowered for marker in HEALTHY_MARKERS)
        return ings

    def _extract_all_methods(self, text: str) -> List[str]:
        """Return the configured cooking methods that occur in ``text``.

        Matching is a case-insensitive substring test. Results are unique and
        follow the order of ``self.cfg.cooking_methods``, so the output is the
        same on every run (a plain ``set`` would order strings arbitrarily).
        """
        lowered = text.lower()
        found = (m for m in self.cfg.cooking_methods if m.lower() in lowered)
        # dict.fromkeys removes duplicates while preserving first-seen order.
        return list(dict.fromkeys(found))

    def _extract_servings(self, text: str) -> int:
        """Return the servings count stated in ``text``.

        The phrases "serve(s) N", "N serving(s)", "make(s) N" and
        "for N people" are tried in that order on the lower-cased text; the
        first hit wins. Falls back to ``config.default_servings``.
        """
        lowered = text.lower()
        for pattern in self._SERVINGS_PATTERNS:
            hit = pattern.search(lowered)
            if hit:
                return int(hit.group(1))
        return config.default_servings