File size: 2,280 Bytes
8952fff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Bambara phonetic normalizer.

Unifies French-influenced and informal spellings to the standard
N'Ko-derived Bambara orthography used in most NLP datasets.

Key rules (most impactful for ASR training):
    ou  →  u        French vowel → Bambara standard
    gn  →  ɲ        French nasal palatal
    ny  →  ɲ        English nasal palatal notation
    dj  →  j        French palatal affricate
    ch  →  c        French palatalized consonant
    oo  →  ɔ        long open-o (common informal spelling)
    ee  →  ɛ        long open-e (common informal spelling)

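These rules are applied in a single left-to-right, non-overlapping pass over
lower-cased text.  The rule set is deliberately small, but every substitution
is applied wherever its letter sequence occurs: there is no contextual check,
so words that contain these sequences in a non-phonemic context (for example
loanwords or proper names) are rewritten as well.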

Usage:
    from src.data.bam_normalize import normalize
    text = normalize("I ni ce, a bɛ djourou la")
    # → "i ni ce, a bɛ juru la"
"""
from __future__ import annotations

import re
import unicodedata


# ── Replacement table ───────────────────────────────────────────────────────
# Each entry is (source spelling, standard spelling).
#
# Regex alternation takes the FIRST alternative that matches at a position,
# so a longer source must be tried before any shorter source it starts with.
# Every source below is two characters long and distinct, so no two of them
# can match at the same position today; the length sort applied when the
# pattern is built keeps that true if a longer rule is added later.
_RULES: list[tuple[str, str]] = [
    ("ou",  "u"),    # most frequent French influence
    ("dj",  "j"),    # palatal affricate
    ("gn",  "ɲ"),    # nasal palatal (French orthography)
    ("ny",  "ɲ"),    # nasal palatal (English-style notation)
    ("ch",  "c"),    # palatalized stop
    ("oo",  "ɔ"),    # long open-o (informal doubling)
    ("ee",  "ɛ"),    # long open-e (informal doubling)
]

# Compile once at import time.  Sources are ordered longest first; sorted()
# is stable (also with reverse=True), so equal-length rules keep table order.
_PATTERN = re.compile(
    "|".join(
        re.escape(src)
        for src, _ in sorted(_RULES, key=lambda rule: len(rule[0]), reverse=True)
    )
)
# Source spelling → replacement, looked up with the exact text a match consumed.
_REPLACEMENTS = dict(_RULES)


def normalize(text: str) -> str:
    """
    Return the phonetically normalized form of a Bambara text string.

    The input goes through four stages, in this order:
        1. Unicode NFC composition.
        2. Lower-casing.
        3. Digraph substitution using the module's rule table.
        4. Runs of two or more ASCII spaces become one space, and leading
           and trailing whitespace is stripped.
    """
    def _standard_spelling(match):
        # The matched text is itself the key into the replacement table.
        return _REPLACEMENTS[match[0]]

    lowered = unicodedata.normalize("NFC", text).lower()
    substituted = _PATTERN.sub(_standard_spelling, lowered)
    single_spaced = re.sub(r" {2,}", " ", substituted)
    return single_spaced.strip()


def normalize_batch(texts: list[str]) -> list[str]:
    """Normalize every string in *texts*, returning a new list in the same order."""
    return list(map(normalize, texts))