""" Bambara phonetic normalizer. Unifies French-influenced and informal spellings to the standard N'Ko-derived Bambara orthography used in most NLP datasets. Key rules (most impactful for ASR training): ou → u French vowel → Bambara standard gn → ɲ French nasal palatal ny → ɲ English nasal palatal notation dj → j French palatal affricate ch → c French palatalized consonant oo → ɔ long open-o (common informal spelling) ee → ɛ long open-e (common informal spelling) These rules run left-to-right on lower-cased text. They are conservative: only unambiguous substitutions are applied so as not to corrupt words that happen to contain these letter sequences in a non-phonemic context. Usage: from src.data.bam_normalize import normalize text = normalize("I ni ce, a bɛ djourou la") # → "i ni ce, a bɛ juruu la" """ from __future__ import annotations import re import unicodedata # ── Replacement table (order matters — longest match first) ───────────────── _RULES: list[tuple[str, str]] = [ ("ou", "u"), # most frequent French influence ("dj", "j"), # palatal affricate ("gn", "ɲ"), # nasal palatal (French orthography) ("ny", "ɲ"), # nasal palatal (English-style notation) ("ch", "c"), # palatalized stop ("oo", "ɔ"), # long open-o (informal doubling) ("ee", "ɛ"), # long open-e (informal doubling) ] # Compile once for speed _PATTERN = re.compile( "|".join(re.escape(src) for src, _ in _RULES) ) _REPLACEMENTS = {src: dst for src, dst in _RULES} def normalize(text: str) -> str: """ Apply phonetic normalization to a Bambara text string. Steps: 1. Unicode NFC normalization (collapse combining characters). 2. Lowercase. 3. Apply phoneme substitution rules. 4. Collapse multiple spaces. """ text = unicodedata.normalize("NFC", text) text = text.lower() text = _PATTERN.sub(lambda m: _REPLACEMENTS[m.group(0)], text) text = re.sub(r" {2,}", " ", text).strip() return text def normalize_batch(texts: list[str]) -> list[str]: return [normalize(t) for t in texts]