File size: 6,473 Bytes
ced078c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Adlam ↔ Latin transliteration for Pular (Guinea Fula).

Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
for the Fula language family.  Unicode block U+1E900–U+1E95F.

This module provides:
  - adlam_to_latin(text)   — convert Adlam script → Latin romanization
  - latin_to_adlam(text)   — convert Latin romanization → Adlam script
  - normalize_pular(text)  — canonical pre-processing for ASR training:
                             converts Adlam to Latin, applies Unicode NFC,
                             lowercases, collapses whitespace
  - contains_adlam(text)   — detect whether a string has Adlam characters

Transliteration table follows the standard Pular (Guinea) orthography used in:
  - SIL/Fulfulde literacy materials
  - Pullo-Africa-Protagonist dataset
  - guizme/adlam_fulfulde dataset

Note: Adlam lies outside the Unicode BMP (U+1E900 is in the Supplementary
Multilingual Plane).  Whisper's byte-level BPE tokenizer can still encode it,
but the model has never seen Adlam in pre-training text, so Adlam tokens
produce garbage output.  Training and ASR therefore always use Latin
romanization; Adlam is converted to Latin before feeding to the model, and
Latin is kept as-is for display.
"""
from __future__ import annotations

import re
import unicodedata

# ── Adlam → Latin mapping (capital + small letter pairs) ─────────────────────
# Code points and letter order follow the Unicode Adlam block (U+1E900–U+1E95F):
# 34 capital letters at U+1E900–U+1E921, their small forms at U+1E922–U+1E943
# (capital code point + 0x22), and digits at U+1E950–U+1E959.  Adlam is NOT laid
# out in Latin alphabetical order, so each row is annotated with its Unicode
# letter name (minus the "ADLAM CAPITAL/SMALL LETTER" prefix) to keep the table
# checkable against the code chart.
#
# Letters without a single ASCII equivalent are written as ASCII digraphs
# (bh = ɓ, dh = ɗ, yh = ƴ, ny = ɲ, nh = ŋ).  Capital digraphs are title-cased
# ("Bh") because a capital Adlam letter normally starts a word.  Every
# romanization is unique, so the table can be inverted without collisions.
#
# Combining marks (U+1E944–U+1E94B) and the initial exclamation/question marks
# (U+1E95E–U+1E95F) are deliberately not listed here.
_ADLAM_TO_LATIN: list[tuple[str, str]] = [
    # Capital letters (U+1E900–U+1E921)
    ("\U0001e900", "A"),   # ALIF
    ("\U0001e901", "D"),   # DAALI
    ("\U0001e902", "L"),   # LAAM
    ("\U0001e903", "M"),   # MIIM
    ("\U0001e904", "B"),   # BA
    ("\U0001e905", "S"),   # SINNYIIYHE
    ("\U0001e906", "P"),   # PE
    ("\U0001e907", "Bh"),  # BHE (ɓ)
    ("\U0001e908", "R"),   # RA
    ("\U0001e909", "E"),   # E
    ("\U0001e90a", "F"),   # FA
    ("\U0001e90b", "I"),   # I
    ("\U0001e90c", "O"),   # O
    ("\U0001e90d", "Dh"),  # DHA (ɗ)
    ("\U0001e90e", "Yh"),  # YHE (ƴ)
    ("\U0001e90f", "W"),   # WAW
    ("\U0001e910", "N"),   # NUN
    ("\U0001e911", "K"),   # KAF
    ("\U0001e912", "Y"),   # YA
    ("\U0001e913", "U"),   # U
    ("\U0001e914", "J"),   # JIIM
    ("\U0001e915", "C"),   # CHI
    ("\U0001e916", "H"),   # HA
    ("\U0001e917", "Q"),   # QAAF
    ("\U0001e918", "G"),   # GA
    ("\U0001e919", "Ny"),  # NYA (ɲ)
    ("\U0001e91a", "T"),   # TU
    ("\U0001e91b", "Nh"),  # NHA (ŋ)
    ("\U0001e91c", "V"),   # VA
    ("\U0001e91d", "Kh"),  # KHA
    ("\U0001e91e", "Gb"),  # GBE
    ("\U0001e91f", "Z"),   # ZAL
    ("\U0001e920", "Kp"),  # KPO
    ("\U0001e921", "Sh"),  # SHA
    # Small letters (U+1E922–U+1E943), same order as the capitals
    ("\U0001e922", "a"),   # ALIF
    ("\U0001e923", "d"),   # DAALI
    ("\U0001e924", "l"),   # LAAM
    ("\U0001e925", "m"),   # MIIM
    ("\U0001e926", "b"),   # BA
    ("\U0001e927", "s"),   # SINNYIIYHE
    ("\U0001e928", "p"),   # PE
    ("\U0001e929", "bh"),  # BHE (ɓ)
    ("\U0001e92a", "r"),   # RA
    ("\U0001e92b", "e"),   # E
    ("\U0001e92c", "f"),   # FA
    ("\U0001e92d", "i"),   # I
    ("\U0001e92e", "o"),   # O
    ("\U0001e92f", "dh"),  # DHA (ɗ)
    ("\U0001e930", "yh"),  # YHE (ƴ)
    ("\U0001e931", "w"),   # WAW
    ("\U0001e932", "n"),   # NUN
    ("\U0001e933", "k"),   # KAF
    ("\U0001e934", "y"),   # YA
    ("\U0001e935", "u"),   # U
    ("\U0001e936", "j"),   # JIIM
    ("\U0001e937", "c"),   # CHI
    ("\U0001e938", "h"),   # HA
    ("\U0001e939", "q"),   # QAAF
    ("\U0001e93a", "g"),   # GA
    ("\U0001e93b", "ny"),  # NYA (ɲ)
    ("\U0001e93c", "t"),   # TU
    ("\U0001e93d", "nh"),  # NHA (ŋ)
    ("\U0001e93e", "v"),   # VA
    ("\U0001e93f", "kh"),  # KHA
    ("\U0001e940", "gb"),  # GBE
    ("\U0001e941", "z"),   # ZAL
    ("\U0001e942", "kp"),  # KPO
    ("\U0001e943", "sh"),  # SHA
    # Digits (U+1E950–U+1E959)
    ("\U0001e950", "0"),
    ("\U0001e951", "1"),
    ("\U0001e952", "2"),
    ("\U0001e953", "3"),
    ("\U0001e954", "4"),
    ("\U0001e955", "5"),
    ("\U0001e956", "6"),
    ("\U0001e957", "7"),
    ("\U0001e958", "8"),
    ("\U0001e959", "9"),
]

# Build fast lookup dicts.
# Adlam character → Latin romanization (one entry per table row).
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)

# Lowercase Latin romanization → Adlam.  Only the small-letter and digit rows
# are taken (their romanization is already lowercase), so a lowercase key always
# yields the small Adlam letter rather than its capital.
_L2A: dict[str, str] = {
    latin: adlam for adlam, latin in _ADLAM_TO_LATIN if latin == latin.lower()
}
# Also accept the hooked/special Latin letters as input, alongside the ASCII
# digraphs that adlam_to_latin-style output uses.
_L2A.update({
    "\u0253": _L2A["bh"],  # ɓ
    "\u0257": _L2A["dh"],  # ɗ
    "\u01b4": _L2A["yh"],  # ƴ
    "\u0272": _L2A["ny"],  # ɲ
    "\u00f1": _L2A["ny"],  # ñ
    "\u014b": _L2A["nh"],  # ŋ
})

# Adlam Unicode range for fast detection (whole block, including marks/digits)
_ADLAM_START = 0x1E900
_ADLAM_END   = 0x1E95F


def contains_adlam(text: str) -> bool:
    """Report whether *text* holds at least one code point in the Adlam range.

    The check is a plain range test against ``_ADLAM_START``..``_ADLAM_END``
    (inclusive) and stops at the first hit.  An empty string yields ``False``.
    """
    for char in text:
        if _ADLAM_START <= ord(char) <= _ADLAM_END:
            return True
    return False


def adlam_to_latin(text: str) -> str:
    """
    Convert Adlam script characters to Latin romanization.

    Each character found in the ``_A2L`` lookup is replaced by its
    romanization; every other character passes through unchanged.

    The Adlam alif lengthener (U+1E944), vowel lengthener (U+1E945) and
    gemination mark (U+1E946) are romanized by doubling the letter they follow
    (long vowels and geminate consonants are written double in Latin), instead
    of leaking into the output as raw Adlam code points.  The repeated copy is
    lowercase, so a capital letter plus lengthener gives e.g. "Aa".  A mark
    that does not directly follow a romanized letter is left untouched.
    """
    # ALIF LENGTHENER, VOWEL LENGTHENER, GEMINATION MARK
    doubling_marks = "\U0001e944\U0001e945\U0001e946"
    pieces = []
    previous = ""  # romanization of the immediately preceding Adlam letter
    for ch in text:
        if previous and ch in doubling_marks:
            pieces.append(previous.lower())
            previous = ""  # one mark doubles once; never triple
            continue
        latin = _A2L.get(ch)
        if latin is None:
            pieces.append(ch)
            previous = ""
        else:
            pieces.append(latin)
            # Only letters can be lengthened/geminated, not digits.
            previous = latin if latin.isalpha() else ""
    return "".join(pieces)


def latin_to_adlam(text: str) -> str:
    """
    Convert Latin romanization to Adlam script.

    Matching is case-insensitive and greedy: at each position a two-letter
    romanization known to ``_L2A`` is tried before a single character.  The
    case of the first Latin letter of a match selects the capital or small
    Adlam letter (via Unicode case mapping), so letter case survives the
    conversion instead of every letter coming out in one case.  Characters
    with no Adlam equivalent are copied through unchanged, in their original
    case.
    """
    out = []
    i = 0
    length = len(text)
    while i < length:
        first = text[i]
        width = 1
        adlam = None
        pair = text[i:i + 2]
        if len(pair) == 2:
            # Digraphs take priority so that e.g. "ny" is one letter, not n + y.
            adlam = _L2A.get(pair.lower())
            if adlam is not None:
                width = 2
        if adlam is None:
            adlam = _L2A.get(first.lower())
        if adlam is None:
            out.append(first)
        else:
            # Normalise explicitly in both directions: the result then does not
            # depend on whether the lookup stores capital or small letters.
            # Digits have no case and come back unchanged.
            out.append(adlam.upper() if first.isupper() else adlam.lower())
        i += width
    return "".join(out)


def normalize_pular(text: str) -> str:
    """
    Canonical pre-processing for Pular (Guinea Fula) ASR training.

    Steps, applied in this order:
      1. Romanize the text when it contains any Adlam character
      2. Normalize to Unicode NFC
      3. Lowercase
      4. Collapse every whitespace run to one space and trim both ends
    """
    romanized = adlam_to_latin(text) if contains_adlam(text) else text
    folded = unicodedata.normalize("NFC", romanized).lower()
    single_spaced = re.sub(r"\s+", " ", folded)
    return single_spaced.strip()