Spaces:
Running
Running
File size: 6,473 Bytes
ced078c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | """
Adlam ↔ Latin transliteration for Pular (Guinea Fula).
Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
for the Fula language family. Unicode block U+1E900–U+1E95F.
This module provides:
- adlam_to_latin(text) — convert Adlam script → Latin romanization
- latin_to_adlam(text) — convert Latin romanization → Adlam script
- normalize_pular(text) — canonical pre-processing for ASR training:
converts Adlam to Latin, applies Unicode NFC, lowercases, collapses whitespace
- contains_adlam(text) — detect whether a string has Adlam characters
Transliteration table follows the standard Pular (Guinea) orthography used in:
- SIL/Fulfulde literacy materials
- Pullo-Africa-Protagonist dataset
- guizme/adlam_fulfulde dataset
Note: Whisper's byte-level BPE tokenizer can encode any Unicode text (including
Adlam, which lies outside the BMP at U+1E900–U+1E95F), but the model has never seen
Adlam in pre-training text, so Adlam tokens produce garbage output. Training
and ASR therefore always use Latin romanization; Adlam is converted to Latin
before feeding to the model, and Latin is kept as-is for display.
"""
from __future__ import annotations
import re
import unicodedata
# ── Adlam → Latin mapping (capital + small pairs, then digits) ───────────────
# Source: Unicode Adlam code chart (U+1E900–U+1E95F).
#
# Adlam letters are encoded in Adlam alphabetical order (A, D, L, M, B, …),
# NOT in Latin A–Z order, so each code point is romanized according to its
# Unicode character name. The implosives and nasals use the Fula Latin letters
# ɓ, ɗ, ƴ, ɲ, ŋ so that every native letter keeps a distinct romanization.
_ADLAM_CAPITAL_ROMANIZATIONS: tuple[str, ...] = (
    "A",   # U+1E900 ALIF
    "D",   # U+1E901 DAALI
    "L",   # U+1E902 LAAM
    "M",   # U+1E903 MIIM
    "B",   # U+1E904 BA
    "S",   # U+1E905 SINNYIIYHE
    "P",   # U+1E906 PE
    "Ɓ",   # U+1E907 BHE
    "R",   # U+1E908 RA
    "E",   # U+1E909 E
    "F",   # U+1E90A FA
    "I",   # U+1E90B I
    "O",   # U+1E90C O
    "Ɗ",   # U+1E90D DHA
    "Ƴ",   # U+1E90E YHE
    "W",   # U+1E90F WAW
    "N",   # U+1E910 NUN
    "K",   # U+1E911 KAF
    "Y",   # U+1E912 YA
    "U",   # U+1E913 U
    "J",   # U+1E914 JIIM
    "C",   # U+1E915 CHI
    "H",   # U+1E916 HA
    "Q",   # U+1E917 QAAF
    "G",   # U+1E918 GA
    "Ɲ",   # U+1E919 NYA
    "T",   # U+1E91A TU
    "Ŋ",   # U+1E91B NHA
    # Supplementary letters used for loanwords and other languages
    "V",   # U+1E91C VA
    "KH",  # U+1E91D KHA
    "GB",  # U+1E91E GBE
    "Z",   # U+1E91F ZAL
    "KP",  # U+1E920 KPO
    "SH",  # U+1E921 SHA
)

# First code point of each run inside the Adlam block.
_ADLAM_CAPITAL_BASE = 0x1E900
_ADLAM_SMALL_BASE = 0x1E922
_ADLAM_DIGIT_BASE = 0x1E950

# (adlam_char, latin) rows: capitals, then small letters, then digits.
# Small-letter rows are derived from the capitals so the two cases can never
# drift apart.
_ADLAM_TO_LATIN: list[tuple[str, str]] = [
    # Capital letters (U+1E900–U+1E921)
    *(
        (chr(_ADLAM_CAPITAL_BASE + offset), latin)
        for offset, latin in enumerate(_ADLAM_CAPITAL_ROMANIZATIONS)
    ),
    # Small letters (U+1E922–U+1E943)
    *(
        (chr(_ADLAM_SMALL_BASE + offset), latin.lower())
        for offset, latin in enumerate(_ADLAM_CAPITAL_ROMANIZATIONS)
    ),
    # Digits (U+1E950–U+1E959)
    *((chr(_ADLAM_DIGIT_BASE + digit), str(digit)) for digit in range(10)),
]

# Build fast lookup dicts
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)

# Latin → Adlam, keyed by lowercase romanization. Only rows whose romanization
# is already lowercase (small letters and digits) are used, so a lowercase
# Latin key maps to the SMALL Adlam letter. If two rows ever share a
# romanization, the first one listed wins.
_L2A: dict[str, str] = {}
for _a, _l in _ADLAM_TO_LATIN:
    if _l == _l.lower():
        _L2A.setdefault(_l, _a)
# Inclusive code-point bounds of the Unicode Adlam block, used for detection.
_ADLAM_START = 0x1E900
_ADLAM_END = 0x1E95F


def contains_adlam(text: str) -> bool:
    """Report whether *text* holds at least one code point in the Adlam block."""
    for symbol in text:
        if _ADLAM_START <= ord(symbol) <= _ADLAM_END:
            return True
    return False
def adlam_to_latin(text: str) -> str:
    """Romanize *text*: characters found in ``_A2L`` are replaced by their Latin value, all others are kept verbatim."""
    return "".join(_A2L.get(symbol, symbol) for symbol in text)
def latin_to_adlam(text: str) -> str:
    """
    Convert Latin romanization to Adlam script, preserving letter case.

    The input is scanned left to right. At each position a two-character
    romanization (digraph) from ``_L2A`` is tried before a single character,
    so a digraph always wins over its first letter. Lookup is
    case-insensitive; the case of the first Latin character of each match
    decides whether the capital or the small Adlam letter is emitted.
    Characters with no entry in ``_L2A`` are copied through unchanged.

    Args:
        text: Latin-script text to convert.

    Returns:
        ``text`` with every mapped letter, digraph and digit replaced by its
        Adlam counterpart.
    """
    out: list[str] = []
    i = 0
    n = len(text)
    while i < n:
        adlam = None
        # Longest match first: try a digraph, then fall back to one character.
        for width in (2, 1):
            chunk = text[i:i + width]
            if len(chunk) == width:
                adlam = _L2A.get(chunk.lower())
            if adlam is not None:
                break
        if adlam is None:
            # Not a romanized letter (punctuation, space, unknown symbol).
            out.append(text[i])
            i += 1
            continue
        # _L2A is keyed by lowercase Latin and may hold either case of the
        # Adlam letter, so the output case is taken from the input instead.
        # Adlam is a bicameral script; str.upper()/lower() map between its
        # capital and small letters and leave digits untouched.
        out.append(adlam.upper() if text[i].isupper() else adlam.lower())
        i += width
    return "".join(out)
def normalize_pular(text: str) -> str:
    """
    Produce the canonical form of a Pular (Guinea Fula) transcript for ASR training.

    Pipeline, in order:
    1. Romanize Adlam characters when any are present
    2. Apply Unicode NFC normalization
    3. Lowercase
    4. Collapse each whitespace run to one space and trim both ends
    """
    romanized = adlam_to_latin(text) if contains_adlam(text) else text
    folded = unicodedata.normalize("NFC", romanized).lower()
    return re.sub(r"\s+", " ", folded).strip()
|