| """recipe_nlp/parser.py — spaCy NER + dependency parsing.""" |
| from __future__ import annotations |
| import re |
| from dataclasses import dataclass, field |
| from typing import List |
| from utils.config import config, NLPConfig |
| from utils.logger import logger |
|
|
# Lower-case unit words recognised when separating a unit from a food noun.
# Membership is tested against token text and lemma elsewhere in this module.
UNIT_VOCAB = {
    # spoons and cups
    "cup", "cups",
    "tablespoon", "tablespoons", "tbsp", "tbs",
    "teaspoon", "teaspoons", "tsp",
    # liquid volume
    "fluid ounce", "fl oz",
    "liter", "liters", "litre", "litres", "l",
    "milliliter", "milliliters", "ml",
    "pint", "pints",
    "quart", "quarts",
    "gallon", "gallons",
    # weight
    "gram", "grams", "g",
    "kilogram", "kilograms", "kg",
    "ounce", "ounces", "oz",
    "pound", "pounds", "lb", "lbs",
    # countable portions
    "piece", "pieces",
    "slice", "slices",
    "clove", "cloves",
    "head", "heads",
    "bunch", "bunches",
    "handful", "handfuls",
    # containers
    "can", "cans",
    "jar", "jars",
    "package", "packages",
    # imprecise amounts
    "pinch", "dash", "sprinkle",
}
|
|
@dataclass
class ParsedToken:
    """Per-token record: four required string fields plus optional role flags.

    NOTE(review): the field names suggest spaCy token attributes (text,
    lemma, part-of-speech, dependency label); confirm against the code
    that builds these records.
    """

    # Required string attributes, in positional order.
    text: str
    lemma: str
    pos: str
    dep: str
    # Role flags; all default to False.
    is_food: bool = False
    is_quantity: bool = False
    is_unit: bool = False
    is_method: bool = False
    # Presumably the text of the token's syntactic head; empty by default.
    head_text: str = ""
|
|
@dataclass
class RawIngredientMention:
    """One candidate ingredient found in recipe text, before normalisation.

    Only ``food_token`` is required; every other field defaults to the empty
    string, which stands for "nothing found".
    """

    # Text of the noun that names the food.
    food_token: str
    # Quantity as written in the text.
    quantity_str: str = ""
    # Unit word as written in the text.
    unit_str: str = ""
    # Cooking-method word found near the food.
    method_str: str = ""
    # Text of the sentence the mention was found in.
    sentence: str = ""
|
|
|
|
class RecipeParser:
    """Extract raw ingredient mentions from recipe text using a spaCy pipeline.

    The pipeline is loaded lazily on first use, so constructing a parser does
    not import spaCy or touch the model.
    """

    # A bare fraction such as "1/2"; always applied with fullmatch().
    _FRACTION_RE = re.compile(r"\d+/\d+")
    # A cooking-method word is searched for at most this many tokens away
    # from a noun-chunk head, on either side.
    _METHOD_WINDOW = 10

    def __init__(self, cfg: "NLPConfig | None" = None):
        """Store the NLP settings, falling back to the global ``config.nlp``."""
        self.cfg = cfg or config.nlp
        self._nlp = None  # filled in by _load_nlp() on first use

    def _load_nlp(self):
        """Return the spaCy pipeline named by ``cfg.spacy_model``, loading it once.

        If the first ``spacy.load`` raises ``OSError`` the model is downloaded
        and loaded again; a failure of that second load propagates.
        """
        if self._nlp is None:
            import spacy

            model_name = self.cfg.spacy_model
            try:
                self._nlp = spacy.load(model_name)
            except OSError:
                # Treated as "model not installed": fetch it and retry once.
                # Log the configured model name, not a hard-coded one.
                logger.info(f"Downloading spaCy model {model_name} …")
                from spacy.cli import download

                download(model_name)
                self._nlp = spacy.load(model_name)
        return self._nlp

    def _is_fraction(self, text: str) -> bool:
        """Return True when *text* is exactly ``<digits>/<digits>``, e.g. "1/2"."""
        # fullmatch (unlike match with "$") also rejects a trailing newline.
        return self._FRACTION_RE.fullmatch(text) is not None

    def extract_raw_mentions(self, text: str) -> "List[RawIngredientMention]":
        """Return one RawIngredientMention per food-like noun chunk in *text*.

        The text is lower-cased before parsing, so every returned string is
        lower-case. A noun chunk is kept when its root is a NOUN or PROPN and
        the root's text is not a unit word. For each kept chunk the method
        records a quantity, a unit, a nearby cooking-method word, and the
        sentence containing the chunk; any of these is "" when not found.

        Args:
            text: Recipe text. Empty, ``None`` or whitespace-only input
                yields an empty list without loading the model.

        Returns:
            Mentions in noun-chunk order.
        """
        if not text or not text.strip():
            return []

        doc = self._load_nlp()(text.lower())
        methods_lower = {m.lower() for m in self.cfg.cooking_methods}
        # Materialise sentences once instead of re-walking them per chunk.
        sentences = list(doc.sents)

        mentions = []
        for chunk in doc.noun_chunks:
            head = chunk.root
            if head.pos_ not in ("NOUN", "PROPN") or head.text in UNIT_VOCAB:
                continue
            quantity_str, unit_str = self._quantity_and_unit(chunk)
            mentions.append(
                RawIngredientMention(
                    head.text,
                    quantity_str,
                    unit_str,
                    self._method_near(doc, head, methods_lower),
                    self._sentence_text(sentences, chunk),
                )
            )
        return mentions

    def _quantity_and_unit(self, chunk):
        """Return ``(quantity, unit)`` text for a noun chunk, "" where absent."""
        quantity_str = unit_str = ""
        # First choice: direct dependents of the chunk root. A later child
        # overwrites an earlier one of the same kind.
        for child in chunk.root.children:
            if child.dep_ in ("nummod", "quantmod") or child.like_num:
                quantity_str = child.text
            elif child.text in UNIT_VOCAB or child.lemma_ in UNIT_VOCAB:
                unit_str = child.text
        if not quantity_str:
            # Fallback: first number-like or fraction token inside the chunk.
            quantity_str = next(
                (
                    tok.text
                    for tok in chunk
                    if tok.like_num or self._is_fraction(tok.text)
                ),
                "",
            )
        return quantity_str, unit_str

    def _method_near(self, doc, head, methods_lower):
        """Return the first cooking-method token within the window around *head*."""
        # Scanning only the window slice gives the same first match (in
        # document order) as filtering the whole doc by distance.
        lo = max(0, head.i - self._METHOD_WINDOW)
        hi = head.i + self._METHOD_WINDOW + 1
        for token in doc[lo:hi]:
            if token.lemma_ in methods_lower or token.text in methods_lower:
                return token.text
        return ""

    @staticmethod
    def _sentence_text(sentences, chunk):
        """Return the text of the first sentence fully containing *chunk*, or ""."""
        for sent in sentences:
            if chunk.start >= sent.start and chunk.end <= sent.end:
                return sent.text
        return ""
|
|