"""Adlam ↔ Latin transliteration for Pular (Guinea Fula).

Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye
Barry for the Fula language family. Unicode block U+1E900–U+1E95F.

This module provides:
  - adlam_to_latin(text)  — convert Adlam script → Latin romanization
  - latin_to_adlam(text)  — convert Latin romanization → Adlam script
  - normalize_pular(text) — canonical pre-processing for ASR training:
                            converts Adlam to Latin, applies Unicode NFC,
                            lowercases, unifies spacing
  - contains_adlam(text)  — detect whether a string has Adlam characters

The Adlam side of the table follows the Unicode code chart, which is laid out
in Adlam alphabet order (ALIF, DAALI, LAAM, MIIM, BA, ...), not Latin
alphabetical order. The Latin side follows the Pular (Guinea) orthography
used in:
  - SIL/Fulfulde literacy materials
  - Pullo-Africa-Protagonist dataset
  - guizme/adlam_fulfulde dataset

Note: Whisper's byte-level BPE tokenizer can encode any Unicode text (Adlam
lies outside the BMP, in the Supplementary Multilingual Plane) but has never
seen Adlam in pre-training text, so Adlam tokens produce garbage output.
Training and ASR therefore always use Latin romanization; Adlam is converted
to Latin before feeding to the model, and Latin is kept as-is for display.
"""

from __future__ import annotations

import re
import unicodedata

# -- Layout of the Adlam block ------------------------------------------------
# Source: Unicode Adlam code chart (U+1E900-U+1E95F).
_ADLAM_START = 0x1E900
_ADLAM_END = 0x1E95F
_CAPITAL_START = 0x1E900  # 34 capital letters, U+1E900-U+1E921
_SMALL_START = 0x1E922    # 34 small letters,   U+1E922-U+1E943
_DIGIT_START = 0x1E950    # 10 digits,          U+1E950-U+1E959
_CASE_OFFSET = _SMALL_START - _CAPITAL_START

# Romanization of each capital letter, in code point order. Non-ASCII Latin
# letters are written as escapes so the table survives a mis-decoded source
# file. Digraphs are title-cased so a capitalized word reads "Sh...", not
# "SH..."; the small-letter form is derived with str.lower().
_CAPITAL_ROMANIZATIONS: tuple[str, ...] = (
    "A",       # U+1E900 ALIF
    "D",       # U+1E901 DAALI
    "L",       # U+1E902 LAAM
    "M",       # U+1E903 MIIM
    "B",       # U+1E904 BA
    "S",       # U+1E905 SINNYIIYHE
    "P",       # U+1E906 PE
    "\u0181",  # U+1E907 BHE  (Latin B with hook)
    "R",       # U+1E908 RA
    "E",       # U+1E909 E
    "F",       # U+1E90A FA
    "I",       # U+1E90B I
    "O",       # U+1E90C O
    "\u018a",  # U+1E90D DHA  (Latin D with hook)
    "\u01b3",  # U+1E90E YHE  (Latin Y with hook)
    "W",       # U+1E90F WAW
    "N",       # U+1E910 NUN
    "K",       # U+1E911 KAF
    "Y",       # U+1E912 YA
    "U",       # U+1E913 U
    "J",       # U+1E914 JIIM
    "C",       # U+1E915 CHI
    "H",       # U+1E916 HA
    "Q",       # U+1E917 QAAF
    "G",       # U+1E918 GA
    "\u019d",  # U+1E919 NYA  (Latin N with left hook)
    "T",       # U+1E91A TU
    "\u014a",  # U+1E91B NHA  (Latin Eng)
    "V",       # U+1E91C VA
    "Kh",      # U+1E91D KHA
    "Gb",      # U+1E91E GBE
    "Z",       # U+1E91F ZAL
    "Kp",      # U+1E920 KPO
    "Sh",      # U+1E921 SHA
)

# Adlam -> Latin pairs: capitals, then small letters, then digits.
_ADLAM_TO_LATIN: list[tuple[str, str]] = (
    [
        (chr(_CAPITAL_START + index), latin)
        for index, latin in enumerate(_CAPITAL_ROMANIZATIONS)
    ]
    + [
        (chr(_SMALL_START + index), latin.lower())
        for index, latin in enumerate(_CAPITAL_ROMANIZATIONS)
    ]
    + [(chr(_DIGIT_START + digit), str(digit)) for digit in range(10)]
)

# Fast lookup: Adlam character -> Latin romanization.
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)

# Fast lookup: lowercase Latin token -> small Adlam letter (or Adlam digit).
# Capital Adlam letters are excluded on purpose; latin_to_adlam() derives them
# from the case of the Latin input.
_L2A: dict[str, str] = {
    latin: adlam
    for adlam, latin in _ADLAM_TO_LATIN
    if not _CAPITAL_START <= ord(adlam) < _SMALL_START
}

# Alternative Latin spellings accepted by latin_to_adlam(). The ASCII digraphs
# mirror the Unicode letter names (BHE, DHA, YHE, NHA, NYA) so the hooked
# letters can be typed on a plain keyboard. "ng" is deliberately NOT an alias
# for eng: it must stay NUN + GA, the prenasalized consonant.
_LATIN_ALIASES: dict[str, str] = {
    "bh": "\u0253",
    "dh": "\u0257",
    "yh": "\u01b4",
    "nh": "\u014b",
    "ny": "\u0272",
    "\u00f1": "\u0272",  # n with tilde
}
_L2A.update({alias: _L2A[canonical] for alias, canonical in _LATIN_ALIASES.items()})

# Longest Latin token in _L2A; bounds the greedy match in latin_to_adlam().
_MAX_LATIN_TOKEN = max(map(len, _L2A))

# U+1E944 ALIF LENGTHENER, U+1E945 VOWEL LENGTHENER, U+1E946 GEMINATION MARK:
# each doubles the letter it follows (long vowel / geminate consonant).
_REPEAT_MARKS = frozenset("\U0001e944\U0001e945\U0001e946")

# U+1E94B NASALIZATION MARK.
# NOTE(review): romanized as "n" on the assumption that it marks
# prenasalization of the following consonant -- confirm against the corpus.
_NASALIZATION_MARK = "\U0001e94b"

# U+1E947 HAMZA, U+1E948 CONSONANT MODIFIER, U+1E949 GEMINATE CONSONANT
# MODIFIER, U+1E94A NUKTA, U+1E95E INITIAL EXCLAMATION MARK, U+1E95F INITIAL
# QUESTION MARK. This romanization has no counterpart for them, so they are
# dropped rather than leaked into Latin text.
# NOTE(review): revisit if loanword fidelity matters for the target data.
_DROPPED_MARKS = frozenset(
    "\U0001e947\U0001e948\U0001e949\U0001e94a\U0001e95e\U0001e95f"
)

_WHITESPACE_RUN = re.compile(r"\s+")


def contains_adlam(text: str) -> bool:
    """Return True if *text* contains any code point of the Adlam block."""
    return any(_ADLAM_START <= ord(char) <= _ADLAM_END for char in text)


def adlam_to_latin(text: str) -> str:
    """Convert Adlam script to its Latin romanization.

    Letters and digits are mapped through the transliteration table, with
    letter case preserved. A lengthener or gemination mark doubles the letter
    it follows, the nasalization mark becomes "n", and marks without a Latin
    counterpart are dropped. Every other character, including unassigned
    code points of the Adlam block, passes through unchanged.

    >>> adlam_to_latin("\\U0001e900\\U0001e923\\U0001e924\\U0001e922\\U0001e925")
    'Adlam'
    """
    pieces: list[str] = []
    previous_letter = ""  # romanization of the letter a following mark modifies
    for char in text:
        latin = _A2L.get(char)
        if latin is not None:
            pieces.append(latin)
            # Digits cannot be lengthened or geminated.
            previous_letter = "" if latin.isdigit() else latin
        elif char in _REPEAT_MARKS:
            # The doubled copy is lowercase so a capital yields "Aa", not "AA".
            pieces.append(previous_letter.lower())
        elif char == _NASALIZATION_MARK:
            pieces.append("n")
            previous_letter = ""
        elif char in _DROPPED_MARKS:
            continue
        else:
            pieces.append(char)
            previous_letter = ""
    return "".join(pieces)


def latin_to_adlam(text: str) -> str:
    """Convert a Latin romanization to Adlam script.

    The input is NFC-normalized, then scanned left to right with greedy
    longest-match, so digraphs (kh, gb, kp, sh and the aliases bh, dh, yh,
    nh, ny) win over their single letters. Case is preserved: a token whose
    first character is uppercase yields the capital Adlam letter. Characters
    with no Adlam counterpart pass through unchanged.

    Long vowels and geminate consonants are written by doubling the letter;
    the Adlam lengthener and gemination marks are never produced.
    """
    text = unicodedata.normalize("NFC", text)
    pieces: list[str] = []
    position = 0
    length = len(text)
    while position < length:
        # Try the widest candidate first so "sh" is not read as "s" + "h".
        for width in range(min(_MAX_LATIN_TOKEN, length - position), 0, -1):
            chunk = text[position:position + width]
            adlam = _L2A.get(chunk.lower())
            if adlam is None:
                continue
            # Only letters have a capital form; digits are left as they are.
            if chunk[0].isupper() and ord(adlam) >= _SMALL_START and ord(adlam) < _DIGIT_START:
                adlam = chr(ord(adlam) - _CASE_OFFSET)
            pieces.append(adlam)
            position += width
            break
        else:
            # No table entry starts here: keep the character as-is.
            pieces.append(text[position])
            position += 1
    return "".join(pieces)


def normalize_pular(text: str) -> str:
    """Canonical pre-processing for Pular (Guinea Fula) ASR training.

    Steps, in order:
      1. Convert Adlam -> Latin if any Adlam character is present
      2. Unicode NFC
      3. Lowercase
      4. Collapse whitespace runs to one space and strip both ends
    """
    if contains_adlam(text):
        text = adlam_to_latin(text)
    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    return _WHITESPACE_RUN.sub(" ", text).strip()