Spaces:

he99codes
/

Recipe_Health_Classification

Sleeping

App Files Files Community

Recipe_Health_Classification / recipe_nlp /extractor.py

he99codes

Clean deployment with LFS setup correctly

f75c5b2 about 1 month ago

raw

history blame contribute delete

5.67 kB

	"""recipe_nlp/extractor.py — ingredient extraction and normalization."""
	from __future__ import annotations
	import re, json
	from dataclasses import dataclass, field
	from typing import List, Dict, Any
	from recipe_nlp.parser import RecipeParser, RawIngredientMention
	from utils.config import config, NLPConfig
	from utils.logger import logger

	# Unicode vulgar-fraction glyphs mapped to decimal strings (three decimal
	# places at most). RecipeExtractor._preprocess substitutes these into the
	# recipe text. The eighths were previously incomplete (⅛ and ⅜ only), and
	# fifths/sixths were missing, so those glyphs were left unconverted.
	FRACTION_MAP = {
	    "½": "0.5",
	    "⅓": "0.333",
	    "⅔": "0.667",
	    "¼": "0.25",
	    "¾": "0.75",
	    "⅛": "0.125",
	    "⅜": "0.375",
	    "⅝": "0.625",
	    "⅞": "0.875",
	    "⅕": "0.2",
	    "⅖": "0.4",
	    "⅗": "0.6",
	    "⅘": "0.8",
	    "⅙": "0.167",
	    "⅚": "0.833",
	}

	# Food tokens that are never kept as ingredients: generic recipe words,
	# time/temperature words and kitchen equipment.
	# RecipeExtractor._normalize_mentions skips any mention whose lower-cased
	# name is in this set.
	INGREDIENT_BLACKLIST = {
	    "recipe", "dish", "meal", "food", "step",
	    "minute", "minutes", "hour", "hours",
	    "degree", "degrees", "temperature", "heat",
	    "pan", "pot", "oven", "skillet",
	    "bowl", "plate", "cup", "spoon", "knife", "board", "cutting",
	}

	# Keywords that mark an ingredient as high-risk. They are matched as
	# substrings of the ingredient name in
	# RecipeExtractor._annotate_health_flags.
	HIGH_RISK = {
	    "butter", "lard", "shortening", "margarine",
	    "cream cheese", "heavy cream", "double cream",
	    "bacon", "sausage",
	    "white sugar", "corn syrup", "mayonnaise",
	}

	# Keywords that mark an ingredient as healthy; matched the same way as
	# HIGH_RISK (substring of the ingredient name).
	HEALTHY_MARKERS = {
	    "spinach", "kale", "broccoli", "cauliflower", "carrot", "celery",
	    "apple", "banana", "berry", "blueberry", "strawberry",
	    "salmon", "tuna",
	    "quinoa", "oat", "lentil", "chickpea", "bean",
	    "almond", "walnut", "avocado", "olive oil",
	}

	@dataclass
	class Ingredient:
	    """A single normalised ingredient.

	    ``is_high_risk`` and ``is_healthy`` live on the instance only; they are
	    not included in the dictionary returned by :meth:`to_dict`.
	    """

	    name: str
	    quantity: str = ""
	    unit: str = ""
	    method: str = ""
	    is_high_risk: bool = False
	    is_healthy: bool = False

	    def to_dict(self) -> Dict[str, Any]:
	        """Return name, quantity, unit and method as a plain dict."""
	        exported = ("name", "quantity", "unit", "method")
	        return {key: getattr(self, key) for key in exported}

	@dataclass
	class RecipeStructure:
	    """Structured view of one recipe: ingredients, methods and servings.

	    ``raw_text`` is kept on the instance but is not part of the output of
	    :meth:`to_dict` / :meth:`to_json`.
	    """

	    ingredients: List[Ingredient] = field(default_factory=list)
	    cooking_methods: List[str] = field(default_factory=list)
	    servings_hint: int = 4
	    raw_text: str = ""

	    def to_dict(self) -> Dict[str, Any]:
	        """Return ingredients (as dicts), cooking methods and servings hint."""
	        serialised = []
	        for item in self.ingredients:
	            serialised.append(item.to_dict())
	        return {
	            "ingredients": serialised,
	            "cooking_methods": self.cooking_methods,
	            "servings_hint": self.servings_hint,
	        }

	    def to_json(self, indent: int = 2) -> str:
	        """Return :meth:`to_dict` encoded as JSON with the given indent."""
	        payload = self.to_dict()
	        return json.dumps(payload, indent=indent)


	class RecipeExtractor:
	    """Build a :class:`RecipeStructure` from free-form recipe text.

	    Pipeline (see :meth:`extract`): normalise the text, collect raw ingredient
	    mentions from ``RecipeParser``, drop blacklisted or too-short names, merge
	    duplicates, then flag each ingredient against ``HIGH_RISK`` and
	    ``HEALTHY_MARKERS``.
	    """

	    # Spoken mixed number, e.g. "1-1-slash-3" or "1 1 slash 3".
	    _SPOKEN_MIXED_RE = re.compile(
	        r"(\d+)[\s\-]+(\d+)[\s\-]slash[\s\-](\d+)", re.IGNORECASE
	    )
	    # Spoken simple fraction, e.g. "1-slash-2".
	    _SPOKEN_FRACTION_RE = re.compile(r"(\d+)[\s\-]slash[\s\-](\d+)", re.IGNORECASE)
	    # Quantity-size-unit run, e.g. "3-8-ounce". The alternation must use a
	    # bare "|": an escaped pipe matches a literal "|" character and the
	    # pattern would never fire. The trailing \b keeps a one-letter unit such
	    # as "g" from matching the start of an ordinary word ("3-8-grapes").
	    _COUNT_SIZE_UNIT_RE = re.compile(
	        r"(\d+)-(\d+)-(ounces?|grams?|pounds?|oz|g|lbs?)\b", re.IGNORECASE
	    )
	    # Unit abbreviations expanded to their full singular name, in this order.
	    _UNIT_ABBREVIATIONS = (
	        (re.compile(r"\btbsp\b", re.IGNORECASE), "tablespoon"),
	        (re.compile(r"\btbs\b", re.IGNORECASE), "tablespoon"),
	        (re.compile(r"\btsp\b", re.IGNORECASE), "teaspoon"),
	        (re.compile(r"\boz\b", re.IGNORECASE), "ounce"),
	        (re.compile(r"\blbs?\b", re.IGNORECASE), "pound"),
	    )
	    # Serving-count phrasings, tried in priority order. The leading \b stops
	    # "preserves 2" or "remakes 3" from being read as a servings hint.
	    _SERVINGS_PATTERNS = (
	        re.compile(r"\bserves?\s+(\d+)", re.IGNORECASE),
	        re.compile(r"(\d+)\s+servings?", re.IGNORECASE),
	        re.compile(r"\bmakes?\s+(\d+)", re.IGNORECASE),
	        re.compile(r"\bfor\s+(\d+)\s+people", re.IGNORECASE),
	    )

	    def __init__(self, cfg: NLPConfig | None = None):
	        """Create an extractor.

	        Args:
	            cfg: NLP configuration; when falsy, ``config.nlp`` is used.
	        """
	        self.cfg = cfg or config.nlp
	        # NOTE(review): the parser receives the caller's ``cfg`` (possibly
	        # None), not the resolved ``self.cfg`` - presumably RecipeParser
	        # applies its own default; confirm against recipe_nlp.parser.
	        self.parser = RecipeParser(cfg)

	    def extract(self, recipe_text: str) -> RecipeStructure:
	        """Run the whole pipeline on ``recipe_text``.

	        Returns:
	            A ``RecipeStructure`` whose ``raw_text`` is the *preprocessed*
	            text (the same string the parser and the method/servings
	            detection saw), not the caller's original input.
	        """
	        text = self._preprocess(recipe_text)
	        mentions = self.parser.extract_raw_mentions(text)
	        ings = self._normalize_mentions(mentions)
	        ings = self._deduplicate(ings)
	        ings = self._annotate_health_flags(ings)
	        return RecipeStructure(
	            ingredients=ings,
	            cooking_methods=self._extract_all_methods(text),
	            servings_hint=self._extract_servings(text),
	            raw_text=text,
	        )

	    def _preprocess(self, text: str) -> str:
	        """Normalise numbers, whitespace and unit abbreviations.

	        Steps, in order:
	          1. spoken fractions ("1-1-slash-3" -> "1.333", "1-slash-2" -> "0.5");
	          2. "3-8-ounce" style runs -> "3 8 ounce";
	          3. Unicode fraction glyphs from ``FRACTION_MAP`` ("½" -> "0.5",
	             "1½" -> "1.5");
	          4. collapse whitespace runs to one space and strip the ends;
	          5. expand tbsp/tbs/tsp/oz/lb(s) to full unit names.
	        """
	        # Mixed numbers go first so their "N-slash-M" tail is not consumed
	        # by the simple-fraction pattern.
	        text = self._SPOKEN_MIXED_RE.sub(self._spoken_to_decimal, text)
	        text = self._SPOKEN_FRACTION_RE.sub(self._spoken_to_decimal, text)
	        text = self._COUNT_SIZE_UNIT_RE.sub(r"\1 \2 \3", text)
	        text = self._expand_unicode_fractions(text)
	        text = re.sub(r"\s+", " ", text).strip()
	        for pattern, full_name in self._UNIT_ABBREVIATIONS:
	            text = pattern.sub(full_name, text)
	        return text

	    @staticmethod
	    def _spoken_to_decimal(match) -> str:
	        """Turn a spoken-fraction match into a decimal rounded to 3 places."""
	        # Groups are (numerator, denominator), optionally preceded by a whole
	        # part for the mixed-number pattern.
	        *whole, numerator, denominator = (int(g) for g in match.groups())
	        if denominator == 0:
	            # "1-slash-0" is not a number; leave the text untouched rather
	            # than raising ZeroDivisionError on odd transcripts.
	            return match.group(0)
	        return str(round(sum(whole) + numerator / denominator, 3))

	    @staticmethod
	    def _expand_unicode_fractions(text: str) -> str:
	        """Replace ``FRACTION_MAP`` glyphs with decimals, merging mixed numbers.

	        A glyph directly after an integer (optionally separated by a single
	        space or tab) is folded into it: "1½" and "1 ½" become "1.5". A
	        plain ``str.replace`` would turn "1½" into "10.5".
	        """
	        if not FRACTION_MAP:
	            return text
	        # Longest keys first so a multi-character key is never shadowed.
	        glyphs = sorted(FRACTION_MAP, key=len, reverse=True)
	        alternation = "|".join(re.escape(glyph) for glyph in glyphs)
	        pattern = re.compile(rf"(?:(\d+)[ \t]?)?({alternation})")

	        def _replace(match) -> str:
	            whole, glyph = match.groups()
	            decimal = FRACTION_MAP[glyph]
	            if whole is None:
	                return decimal
	            return str(round(int(whole) + float(decimal), 3))

	        return pattern.sub(_replace, text)

	    def _normalize_mentions(self, mentions: List[RawIngredientMention]) -> List[Ingredient]:
	        """Convert raw parser mentions into ``Ingredient`` objects.

	        Mentions whose lower-cased, stripped name is in
	        ``INGREDIENT_BLACKLIST`` or is two characters or shorter are dropped.
	        ``Ingredient.quantity`` is the quantity string followed by the unit
	        (empty parts omitted); the unit is also stored on its own.
	        """
	        result = []
	        for mention in mentions:
	            name = mention.food_token.lower().strip()
	            if name in INGREDIENT_BLACKLIST or len(name) <= 2:
	                continue
	            parts = [mention.quantity_str, mention.unit_str]
	            quantity = " ".join(part for part in parts if part)
	            result.append(
	                Ingredient(
	                    name=name,
	                    quantity=quantity,
	                    unit=mention.unit_str,
	                    method=mention.method_str,
	                )
	            )
	        return result

	    def _deduplicate(self, ings: List[Ingredient]) -> List[Ingredient]:
	        """Merge ingredients that share a name, keeping first-seen order.

	        A later mention replaces an earlier one only when the earlier one has
	        no quantity and the later one does. Otherwise a missing method on the
	        kept entry is filled from the later mention. Entries are updated in
	        place.
	        """
	        merged = {}
	        for ing in ings:
	            kept = merged.get(ing.name)
	            if kept is None:
	                merged[ing.name] = ing
	            elif not kept.quantity and ing.quantity:
	                # Prefer the quantified mention, but do not lose a method
	                # that only the earlier mention carried.
	                if not ing.method:
	                    ing.method = kept.method
	                merged[ing.name] = ing
	            elif not kept.method and ing.method:
	                kept.method = ing.method
	        return list(merged.values())

	    def _annotate_health_flags(self, ings: List[Ingredient]) -> List[Ingredient]:
	        """Set ``is_high_risk`` / ``is_healthy`` on each ingredient in place.

	        A flag is set when any keyword from the corresponding set occurs as a
	        substring of the lower-cased name.
	        """
	        # NOTE(review): substring matching also fires inside longer words
	        # ("butter" in "butternut squash", "oat" in "goat cheese"). Left
	        # unchanged because tightening it would alter existing flags.
	        for ing in ings:
	            lowered = ing.name.lower()
	            ing.is_high_risk = any(marker in lowered for marker in HIGH_RISK)
	            ing.is_healthy = any(marker in lowered for marker in HEALTHY_MARKERS)
	        return ings

	    def _extract_all_methods(self, text: str) -> List[str]:
	        """Return the configured cooking methods found in ``text``.

	        Matching is a case-insensitive substring test. The result follows the
	        order of ``self.cfg.cooking_methods`` with duplicates removed, so it
	        is stable between runs (building the list from a set is not).
	        """
	        lowered = text.lower()
	        found = (m for m in self.cfg.cooking_methods if m.lower() in lowered)
	        return list(dict.fromkeys(found))

	    def _extract_servings(self, text: str) -> int:
	        """Return the serving count stated in ``text``.

	        Patterns are tried in priority order ("serves N", "N servings",
	        "makes N", "for N people"); the first positive number wins. Falls
	        back to ``config.default_servings`` when nothing usable is found.
	        """
	        for pattern in self._SERVINGS_PATTERNS:
	            for match in pattern.finditer(text):
	                servings = int(match.group(1))
	                # Zero servings is not a usable hint; keep looking.
	                if servings > 0:
	                    return servings
	        return config.default_servings