Spaces:

MataStrategy
/

ground-zero

Running

File size: 6,473 Bytes

ced078c

"""
Adlam ↔ Latin transliteration for Pular (Guinea Fula).

Adlam (𞤀𞤣𞤤𞤢𞤥) is the indigenous alphabet created by Ibrahima and Abdoulaye Barry
for the Fula language family.  Unicode block U+1E900–U+1E95F.

This module provides:
  - adlam_to_latin(text)   — convert Adlam script → Latin romanization
  - latin_to_adlam(text)   — convert Latin romanization → Adlam script
  - normalize_pular(text)  — canonical pre-processing for ASR training:
                             converts Adlam to Latin, applies Unicode NFC,
                             lowercases, collapses whitespace
  - contains_adlam(text)   — detect whether a string has Adlam characters

Transliteration table follows the standard Pular (Guinea) orthography used in:
  - SIL/Fulfulde literacy materials
  - Pullo-Africa-Protagonist dataset
  - guizme/adlam_fulfulde dataset

Note: Whisper's byte-level BPE tokenizer can encode any Unicode text, Adlam
included (the Adlam block lies outside the BMP, in the Supplementary
Multilingual Plane), but it has never seen Adlam in pre-training text, so
Adlam tokens produce garbage output.  Training and ASR therefore always use
Latin romanization; Adlam is converted to Latin before feeding to the model,
and Latin is kept as-is for display.
"""
from __future__ import annotations

import re
import unicodedata

# ── Adlam → Latin mapping (uppercase + lowercase pairs) ──────────────────────
# Source: Unicode Adlam code chart (block U+1E900–U+1E95F).
#
# Adlam is NOT encoded in Latin alphabetical order: the block follows the
# traditional Adlam letter order (alif, daali, laam, miim, ba, ...).  Each row
# below is one letter in code-point order, annotated with its Unicode name
# ("ADLAM CAPITAL/SMALL LETTER <name>").  Capital letters start at U+1E900 and
# small letters at U+1E922 in the same order, so both cases are generated from
# this single tuple and can never disagree with each other.
#
# Romanization uses the ASCII digraph spellings that the Unicode letter names
# themselves reflect (bh, dh, yh, nh, ny).  To emit hooked letters instead
# (ɓ, ɗ, ƴ, ŋ, ɲ) only the strings in this tuple need to change.
_ADLAM_LETTER_ROMANIZATIONS: tuple[str, ...] = (
    "a",   # ALIF
    "d",   # DAALI
    "l",   # LAAM
    "m",   # MIIM
    "b",   # BA
    "s",   # SINNYIIYHE
    "p",   # PE
    "bh",  # BHE  (implosive ɓ)
    "r",   # RA
    "e",   # E
    "f",   # FA
    "i",   # I
    "o",   # O
    "dh",  # DHA  (implosive ɗ)
    "yh",  # YHE  (ƴ)
    "w",   # WAW
    "n",   # NUN
    "k",   # KAF
    "y",   # YA
    "u",   # U
    "j",   # JIIM
    "c",   # CHI
    "h",   # HA
    "q",   # QAAF
    "g",   # GA
    "ny",  # NYA  (ɲ)
    "t",   # TU
    "nh",  # NHA  (ŋ)
    "v",   # VA
    "kh",  # KHA
    "gb",  # GBE
    "z",   # ZAL
    "kp",  # KPO
    "sh",  # SHA
)

_ADLAM_CAPITAL_BASE = 0x1E900  # first capital letter (ALIF)
_ADLAM_SMALL_BASE = 0x1E922    # first small letter (ALIF)
_ADLAM_DIGIT_BASE = 0x1E950    # ADLAM DIGIT ZERO

# Ordered (adlam_char, latin) pairs: capitals, then small letters, then digits.
# The combining marks / modifiers (U+1E944–U+1E94B) and the initial
# exclamation / question marks (U+1E95E–U+1E95F) have no entry here.
_ADLAM_TO_LATIN: list[tuple[str, str]] = [
    # Capital letters (U+1E900–U+1E921); digraphs are fully upper-cased.
    *(
        (chr(_ADLAM_CAPITAL_BASE + offset), latin.upper())
        for offset, latin in enumerate(_ADLAM_LETTER_ROMANIZATIONS)
    ),
    # Small letters (U+1E922–U+1E943)
    *(
        (chr(_ADLAM_SMALL_BASE + offset), latin)
        for offset, latin in enumerate(_ADLAM_LETTER_ROMANIZATIONS)
    ),
    # Digits (U+1E950–U+1E959)
    *((chr(_ADLAM_DIGIT_BASE + digit), str(digit)) for digit in range(10)),
]

# Build fast lookup dicts
_A2L: dict[str, str] = dict(_ADLAM_TO_LATIN)

# Reverse lookup is keyed by lowercase Latin.  Only the small-letter and digit
# rows are used, so lowercase Latin maps to *small* Adlam letters rather than
# to capitals (every romanization is unique, so no key is overwritten).
_L2A: dict[str, str] = {
    latin: adlam
    for adlam, latin in _ADLAM_TO_LATIN
    if latin == latin.lower()
}

# Adlam Unicode range for fast detection
_ADLAM_START = 0x1E900
_ADLAM_END   = 0x1E95F


def contains_adlam(text: str) -> bool:
    """Report whether any character of *text* lies in the Adlam Unicode block.

    The check is a pure code-point range test against the module-level block
    bounds (both ends inclusive); an empty string yields False.
    """
    for char in text:
        codepoint = ord(char)
        if codepoint < _ADLAM_START:
            continue
        if codepoint <= _ADLAM_END:
            return True
    return False


def adlam_to_latin(text: str) -> str:
    """Romanize Adlam characters in *text*, one code point at a time.

    Every character that has an entry in the Adlam → Latin lookup table is
    replaced by its Latin romanization; any character without an entry
    (Latin text, punctuation, unmapped code points) is copied unchanged.
    """
    return "".join(_A2L.get(char, char) for char in text)


def latin_to_adlam(text: str) -> str:
    """
    Convert Latin romanization to Adlam script.

    The input is lowercased first, so the result does not depend on the case
    of the Latin text.  Multi-letter romanizations found in the reverse lookup
    table are matched greedily, longest first, before single letters, so a
    digraph such as "sh" becomes one Adlam letter rather than two.  Characters
    with no entry in the table are passed through (lowercased).
    """
    text = text.lower()
    # Every key longer than one character, longest first, so that a longer
    # romanization is never shadowed by a shorter one sharing its prefix.
    multigraphs = sorted(
        (latin for latin in _L2A if len(latin) > 1),
        key=len,
        reverse=True,
    )
    out = []
    pos = 0
    while pos < len(text):
        for latin in multigraphs:
            if text.startswith(latin, pos):
                out.append(_L2A[latin])
                pos += len(latin)
                break
        else:
            # No multi-letter key starts here: fall back to a single character.
            ch = text[pos]
            out.append(_L2A.get(ch, ch))
            pos += 1
    return "".join(out)


def normalize_pular(text: str) -> str:
    """
    Canonical pre-processing for Pular (Guinea Fula) ASR training.

    Steps, in order:
      1. Romanize the text if it contains any Adlam character
      2. Apply Unicode NFC normalization
      3. Lowercase
      4. Replace every whitespace run with one space and trim both ends
    """
    latin = adlam_to_latin(text) if contains_adlam(text) else text
    lowered = unicodedata.normalize("NFC", latin).lower()
    return re.sub(r"\s+", " ", lowered).strip()