"""recipe_nlp/parser.py — spaCy NER + dependency parsing.""" from __future__ import annotations import re from dataclasses import dataclass, field from typing import List from utils.config import config, NLPConfig from utils.logger import logger UNIT_VOCAB = { "cup","cups","tablespoon","tablespoons","tbsp","tbs","teaspoon","teaspoons","tsp", "fluid ounce","fl oz","liter","liters","litre","litres","l","milliliter","milliliters","ml", "pint","pints","quart","quarts","gallon","gallons", "gram","grams","g","kilogram","kilograms","kg","ounce","ounces","oz","pound","pounds","lb","lbs", "piece","pieces","slice","slices","clove","cloves","head","heads","bunch","bunches", "handful","handfuls","can","cans","jar","jars","package","packages","pinch","dash","sprinkle", } @dataclass class ParsedToken: text: str; lemma: str; pos: str; dep: str is_food: bool = False; is_quantity: bool = False is_unit: bool = False; is_method: bool = False head_text: str = "" @dataclass class RawIngredientMention: food_token: str; quantity_str: str = ""; unit_str: str = "" method_str: str = ""; sentence: str = "" class RecipeParser: def __init__(self, cfg: NLPConfig = None): self.cfg = cfg or config.nlp self._nlp = None def _load_nlp(self): if self._nlp is None: import spacy try: self._nlp = spacy.load(self.cfg.spacy_model) except OSError: logger.info("Downloading spaCy model en_core_web_sm …") from spacy.cli import download download(self.cfg.spacy_model) self._nlp = spacy.load(self.cfg.spacy_model) return self._nlp def _is_fraction(self, text: str) -> bool: return bool(re.match(r"^\d+/\d+$", text)) def extract_raw_mentions(self, text: str) -> List[RawIngredientMention]: nlp = self._load_nlp() doc = nlp(text.lower()) methods_lower = {m.lower() for m in self.cfg.cooking_methods} mentions = [] for chunk in doc.noun_chunks: head = chunk.root if head.pos_ not in ("NOUN", "PROPN") or head.text in UNIT_VOCAB: continue sent_text = next((s.text for s in doc.sents if chunk.start >= s.start and chunk.end <= s.end), "") quantity_str = unit_str = method_str = "" for child in head.children: if child.dep_ in ("nummod", "quantmod") or child.like_num: quantity_str = child.text elif child.text in UNIT_VOCAB or child.lemma_ in UNIT_VOCAB: unit_str = child.text if not quantity_str: for token in chunk: if token.like_num or self._is_fraction(token.text): quantity_str = token.text; break for token in doc: if abs(token.i - head.i) <= 10 and (token.lemma_ in methods_lower or token.text in methods_lower): method_str = token.text; break mentions.append(RawIngredientMention(head.text, quantity_str, unit_str, method_str, sent_text)) return mentions