# Spaces: he99codes / Recipe_Health_Classification (Sleeping)
# File size: 5,669 Bytes
# f75c5b2

"""recipe_nlp/extractor.py — ingredient extraction and normalization."""
from __future__ import annotations
import re, json
from dataclasses import dataclass, field
from typing import List, Dict, Any
from recipe_nlp.parser import RecipeParser, RawIngredientMention
from utils.config import config, NLPConfig
from utils.logger import logger

# Unicode vulgar-fraction glyphs mapped to their decimal text (3 decimal places
# at most).  Covers halves, thirds, quarters, fifths, sixths and all eighths.
FRACTION_MAP = {
    "½": "0.5",
    "⅓": "0.333",
    "⅔": "0.667",
    "¼": "0.25",
    "¾": "0.75",
    "⅛": "0.125",
    "⅜": "0.375",
    "⅝": "0.625",
    "⅞": "0.875",
    "⅕": "0.2",
    "⅖": "0.4",
    "⅗": "0.6",
    "⅘": "0.8",
    "⅙": "0.167",
    "⅚": "0.833",
}

# Lower-cased tokens that are never accepted as an ingredient name
# (generic recipe words, time/temperature words, cookware and utensils).
INGREDIENT_BLACKLIST = {
    "recipe","dish","meal","food","step","minute","minutes","hour","hours",
    "degree","degrees","temperature","heat","pan","pot","oven","skillet",
    "bowl","plate","cup","spoon","knife","board","cutting",
}

# Substrings that mark an ingredient name as high-risk.
HIGH_RISK = {
    "butter","lard","shortening","margarine","cream cheese","heavy cream",
    "double cream","bacon","sausage","white sugar","corn syrup","mayonnaise",
}

# Substrings that mark an ingredient name as healthy.
HEALTHY_MARKERS = {
    "spinach","kale","broccoli","cauliflower","carrot","celery","apple","banana",
    "berry","blueberry","strawberry","salmon","tuna","quinoa","oat","lentil",
    "chickpea","bean","almond","walnut","avocado","olive oil",
}

@dataclass
class Ingredient:
    """One ingredient record with its amount, preparation and health flags.

    Only ``name`` is required; every other field has an empty/False default.
    """

    name: str
    quantity: str = ""
    unit: str = ""
    method: str = ""
    is_high_risk: bool = False
    is_healthy: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the name, quantity, unit and method as a plain dict.

        The two health flags are not part of the returned mapping.
        """
        exported = ("name", "quantity", "unit", "method")
        return {key: getattr(self, key) for key in exported}

@dataclass
class RecipeStructure:
    """Container for what was extracted from one recipe text."""

    ingredients: List[Ingredient] = field(default_factory=list)
    cooking_methods: List[str] = field(default_factory=list)
    servings_hint: int = 4
    raw_text: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return ingredients (as dicts), cooking methods and servings hint.

        ``raw_text`` is not included in the returned mapping.
        """
        serialised = [item.to_dict() for item in self.ingredients]
        return {
            "ingredients": serialised,
            "cooking_methods": self.cooking_methods,
            "servings_hint": self.servings_hint,
        }

    def to_json(self, indent: int = 2) -> str:
        """Return :meth:`to_dict` encoded as JSON with the given indent."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent)


class RecipeExtractor:
    """Turn free-form recipe text into a ``RecipeStructure``.

    Pipeline (see :meth:`extract`): normalise the text, obtain raw ingredient
    mentions from ``RecipeParser``, filter and merge them into ``Ingredient``
    objects, flag them against ``HIGH_RISK`` / ``HEALTHY_MARKERS``, and
    collect the cooking methods and serving count mentioned in the text.
    """

    def __init__(self, cfg: NLPConfig = None):
        """Create an extractor.

        Args:
            cfg: NLP settings; ``config.nlp`` is used when this is falsy.
        """
        self.cfg = cfg or config.nlp
        # NOTE(review): the parser receives the caller's ``cfg`` (possibly
        # None) rather than the resolved ``self.cfg`` — confirm RecipeParser
        # applies the same default.
        self.parser = RecipeParser(cfg)

    def extract(self, recipe_text: str) -> RecipeStructure:
        """Run the full extraction pipeline on *recipe_text*.

        Returns:
            A ``RecipeStructure`` whose ``raw_text`` holds the *preprocessed*
            text (the output of :meth:`_preprocess`), not the original input.
        """
        text = self._preprocess(recipe_text)
        mentions = self.parser.extract_raw_mentions(text)
        ings = self._normalize_mentions(mentions)
        ings = self._deduplicate(ings)
        ings = self._annotate_health_flags(ings)
        return RecipeStructure(
            ingredients=ings,
            cooking_methods=self._extract_all_methods(text),
            servings_hint=self._extract_servings(text),
            raw_text=text,
        )

    def _preprocess(self, text: str) -> str:
        """Normalise fractions, whitespace and unit abbreviations in *text*.

        Steps, in order:

        1. Spoken mixed numbers ("1-1-slash-3") become decimals ("1.333").
        2. Spoken fractions ("1-slash-2") become decimals ("0.5").
        3. "3-8-ounce" style quantity-size-unit runs are split on the hyphens.
        4. Unicode fraction glyphs from ``FRACTION_MAP`` become decimals; a
           glyph attached to a whole number ("1½") is added to it ("1.5").
        5. Whitespace runs collapse to one space and the ends are stripped.
        6. tbsp/tbs/tsp/oz/lb/lbs are expanded to their singular unit word.

        A spoken fraction with a zero denominator is left as written instead
        of raising ``ZeroDivisionError``.
        """

        def mixed_to_decimal(m):
            # whole + numerator/denominator, e.g. "1 1 slash 3" -> "1.333"
            denominator = int(m.group(3))
            if denominator == 0:
                return m.group(0)  # not a real fraction; keep the text
            value = int(m.group(1)) + int(m.group(2)) / denominator
            return str(round(value, 3))

        def simple_to_decimal(m):
            # numerator/denominator, e.g. "1 slash 2" -> "0.5"
            denominator = int(m.group(2))
            if denominator == 0:
                return m.group(0)  # not a real fraction; keep the text
            return str(round(int(m.group(1)) / denominator, 3))

        # Mixed numbers must be handled before simple fractions, otherwise
        # "1 1 slash 3" would turn into "1 0.333".
        text = re.sub(
            r'(\d+)[\s\-]+(\d+)[\s\-]*slash[\s\-]*(\d+)',
            mixed_to_decimal,
            text,
            flags=re.IGNORECASE,
        )
        text = re.sub(
            r'(\d+)[\s\-]*slash[\s\-]*(\d+)',
            simple_to_decimal,
            text,
            flags=re.IGNORECASE,
        )
        # "3-8-ounce" -> "3 8 ounce" (quantity-size-unit patterns)
        text = re.sub(
            r'(\d+)-(\d+)-(ounce|gram|pound|oz|g|lb)',
            r'\1 \2 \3',
            text,
            flags=re.IGNORECASE,
        )

        for glyph, decimal in FRACTION_MAP.items():
            if glyph not in text:
                continue
            # A glyph glued to digits is a mixed number: plain replacement
            # would turn "1½" into "10.5", so add the two parts instead.
            text = re.sub(
                r"(\d+)" + re.escape(glyph),
                lambda m, frac=float(decimal): str(round(int(m.group(1)) + frac, 3)),
                text,
            )
            text = text.replace(glyph, decimal)

        text = re.sub(r"\s+", " ", text).strip()

        abbreviations = (
            (r"\btbsp\b", "tablespoon"),
            (r"\btbs\b", "tablespoon"),
            (r"\btsp\b", "teaspoon"),
            (r"\boz\b", "ounce"),
            (r"\blbs?\b", "pound"),
        )
        for pattern, full_word in abbreviations:
            text = re.sub(pattern, full_word, text, flags=re.IGNORECASE)
        return text

    def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]:
        """Convert raw parser mentions into ``Ingredient`` objects.

        Mentions whose lower-cased, stripped food token is in
        ``INGREDIENT_BLACKLIST`` or is at most two characters long are
        dropped.  ``quantity`` is the quantity string and unit string joined
        by a space (empty parts skipped); ``unit`` repeats the unit string.
        """
        result = []
        for m in mentions:
            name = m.food_token.lower().strip()
            if name in INGREDIENT_BLACKLIST or len(name) <= 2:
                continue
            qty = " ".join(filter(None, [m.quantity_str, m.unit_str]))
            result.append(Ingredient(name=name, quantity=qty, unit=m.unit_str, method=m.method_str))
        return result

    def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Merge ingredients that share a name, keeping first-seen order.

        For a repeated name the entry with a quantity is preferred, and a
        missing method is filled in from whichever mention has one, so the
        result does not depend on which mention came first.  Input objects
        may be mutated (their ``method`` can be filled in).
        """
        merged: Dict[str, Ingredient] = {}
        for ing in ings:
            kept = merged.get(ing.name)
            if kept is None:
                merged[ing.name] = ing
            elif not kept.quantity and ing.quantity:
                # The newer mention carries the quantity; keep the older
                # mention's method rather than dropping it.
                if not ing.method:
                    ing.method = kept.method
                merged[ing.name] = ing
            elif not kept.method and ing.method:
                kept.method = ing.method
        return list(merged.values())

    def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]:
        """Set ``is_high_risk`` / ``is_healthy`` on each ingredient in place.

        A flag is set when any entry of ``HIGH_RISK`` / ``HEALTHY_MARKERS``
        occurs as a *substring* of the lower-cased name, so "berry" also
        covers "raspberry".
        NOTE(review): the same rule flags "butternut" via "butter" and "goat"
        via "oat" — confirm whether that is acceptable.

        Returns:
            The same list that was passed in.
        """
        for ing in ings:
            n = ing.name.lower()
            ing.is_high_risk = any(h in n for h in HIGH_RISK)
            ing.is_healthy = any(h in n for h in HEALTHY_MARKERS)
        return ings

    def _extract_all_methods(self, text: str) -> List[str]:
        """Return the configured cooking methods that occur in *text*.

        Matching is a case-insensitive substring test.  Results follow the
        order of ``self.cfg.cooking_methods`` with duplicates removed, so the
        output is stable from run to run.
        """
        haystack = text.lower()
        found = [m for m in self.cfg.cooking_methods if m.lower() in haystack]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(found))

    def _extract_servings(self, text: str) -> int:
        """Return the serving count stated in *text*.

        Patterns are tried in this order: "serve(s) N", "N serving(s)",
        "make(s) N", "for N people".  The keyword patterns are anchored at a
        word boundary so that "preserves 2 tablespoons" is not read as
        "serves 2".  Falls back to ``config.default_servings``.
        """
        lowered = text.lower()
        patterns = (
            r"\bserves?\s+(\d+)",
            r"(\d+)\s+servings?",
            r"\bmakes?\s+(\d+)",
            r"\bfor\s+(\d+)\s+people",
        )
        for pattern in patterns:
            m = re.search(pattern, lowered)
            if m:
                return int(m.group(1))
        return config.default_servings