ground-zero / src /data /bam_normalize.py
jefffffff9
Phase 3: Voice-to-Voice S2S pipeline — F5-TTS, LLM brain, CER metric
8952fff
"""
Bambara phonetic normalizer.
Unifies French-influenced and informal spellings to the standard
N'Ko-derived Bambara orthography used in most NLP datasets.
Key rules (most impactful for ASR training):
ou → u French vowel → Bambara standard
gn → ɲ French nasal palatal
ny → ɲ English nasal palatal notation
dj → j French palatal affricate
ch → c French palatalized consonant
oo → ɔ long open-o (common informal spelling)
ee → ɛ long open-e (common informal spelling)
These rules run left-to-right on lower-cased text. They are conservative:
only unambiguous substitutions are applied so as not to corrupt words that
happen to contain these letter sequences in a non-phonemic context.
Usage:
from src.data.bam_normalize import normalize
text = normalize("I ni ce, a bɛ djourou la")
# → "i ni ce, a bɛ juru la"
"""
from __future__ import annotations
import re
import unicodedata
# ── Replacement table ───────────────────────────────────────────────────────
# Each pair is (informal / French-influenced digraph, standard grapheme).
# Table order becomes the order of the alternatives in the compiled pattern.
# Every source is exactly two distinct characters, so no two alternatives can
# match at the same position in the input.
_RULES: list[tuple[str, str]] = [
    ("ou", "u"),  # most frequent French influence
    ("dj", "j"),  # palatal affricate
    ("gn", "ɲ"),  # nasal palatal (French orthography)
    ("ny", "ɲ"),  # nasal palatal (English-style notation)
    ("ch", "c"),  # palatalized stop
    ("oo", "ɔ"),  # long open-o (informal doubling)
    ("ee", "ɛ"),  # long open-e (informal doubling)
]

# Single alternation over all source digraphs, compiled once at import time so
# that callers do not pay for pattern construction on every string.
_PATTERN = re.compile("|".join(re.escape(pair[0]) for pair in _RULES))

# Digraph → replacement lookup consulted for each regex match.
_REPLACEMENTS = dict(_RULES)
def normalize(text: str) -> str:
    """
    Return the phonetically normalized form of a Bambara text string.

    The input goes through four passes, in this order:
        1. Unicode NFC normalization.
        2. Lower-casing.
        3. Digraph substitution: every match of the module-level
           ``_PATTERN`` is replaced by its entry in ``_REPLACEMENTS``.
        4. Runs of two or more space characters are collapsed to a single
           space, then leading and trailing whitespace is stripped.
    """

    def _swap(match):
        # The matched text is always one of the keys of _REPLACEMENTS,
        # because _PATTERN is built from the same rule table.
        return _REPLACEMENTS[match.group(0)]

    composed = unicodedata.normalize("NFC", text)
    lowered = composed.lower()
    substituted = _PATTERN.sub(_swap, lowered)
    return re.sub(r" {2,}", " ", substituted).strip()
def normalize_batch(texts: list[str]) -> list[str]:
    """
    Normalize every string in ``texts``.

    Returns a new list containing ``normalize(t)`` for each element of
    ``texts``, in the same order as the input.
    """
    return list(map(normalize, texts))