/**
 * Text preprocessor — converts numbers, currencies, ordinals, etc. to words.
 * Port of KittenTTS preprocess.py.
 * https://github.com/KittenML/KittenTTS
 */

// ── Number → Words ──

const ONES = [
  "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
  "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
  "seventeen", "eighteen", "nineteen",
];

const TENS = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"];

const SCALE = ["", "thousand", "million", "billion", "trillion"];

/** Spoken form of each decimal digit, indexed by the digit's value. */
const DIGIT_WORDS = ["zero", ...ONES.slice(1, 10)];

/** Longest digit string that `parseInt` converts without losing precision. */
const MAX_EXACT_DIGITS = 15;

const ORDINAL_EXCEPTIONS: Record<string, string> = {
  one: "first", two: "second", three: "third", four: "fourth", five: "fifth",
  six: "sixth", seven: "seventh", eight: "eighth", nine: "ninth", twelve: "twelfth",
};

const CURRENCY_SYMBOLS: Record<string, string> = {
  "$": "dollar", "€": "euro", "£": "pound", "¥": "yen",
  "₹": "rupee", "₩": "won", "₿": "bitcoin",
};

/** Words for an integer in 0..999; returns "" for 0 so callers can skip empty groups. */
function threeDigitsToWords(n: number): string {
  if (n === 0) return "";
  const parts: string[] = [];
  const hundreds = Math.floor(n / 100);
  const remainder = n % 100;
  if (hundreds) parts.push(`${ONES[hundreds]} hundred`);
  if (remainder >= 20) {
    const tensWord = TENS[Math.floor(remainder / 10)];
    const onesWord = ONES[remainder % 10];
    parts.push(onesWord ? `${tensWord}-${onesWord}` : tensWord);
  } else if (remainder) {
    parts.push(ONES[remainder]);
  }
  return parts.join(" ");
}

/** Reads a string digit by digit ("305" → "three zero five"); non-digit characters are skipped. */
function spellDigits(digits: string): string {
  const words: string[] = [];
  for (const ch of digits) {
    if (ch >= "0" && ch <= "9") words.push(DIGIT_WORDS[Number(ch)]);
  }
  return words.join(" ");
}

/**
 * Stringifies a number without exponent notation, so `1e-7` becomes
 * "0.0000001" instead of "1e-7". Values that already print in plain decimal
 * form (and NaN / ±Infinity) are returned exactly as `String(value)` gives them.
 */
function toPlainDecimal(value: number): string {
  const text = String(value);
  const m = /^(-?)(\d)(?:\.(\d+))?e([+-])(\d+)$/.exec(text);
  if (!m) return text;
  const sign = m[1];
  const digits = m[2] + (m[3] ?? "");
  const fracLen = (m[3] ?? "").length;
  const exp = Number(m[5]);
  if (m[4] === "-") return `${sign}0.${"0".repeat(exp - 1)}${digits}`;
  // Positive exponents only appear at 1e21 and above, where the mantissa is
  // always shorter than the exponent, so the padding count cannot go negative.
  return `${sign}${digits}${"0".repeat(Math.max(0, exp - fracLen))}`;
}

/**
 * Converts a number to English words.
 *
 * - Fractions are truncated toward zero (`-1.5` → "negative one").
 * - Exact hundreds from 100 to 9900 that are not whole thousands, with fewer
 *   than twenty hundreds, are read as "N hundred" (1200 → "twelve hundred").
 * - Values of 10^15 and above are expressed as a multiple of the largest known
 *   scale ("one thousand trillion") instead of dropping the high digits.
 * - NaN and ±Infinity have no spoken form and yield "".
 */
export function numberToWords(n: number): string {
  if (!Number.isFinite(n)) return "";
  const whole = Math.trunc(n);
  if (whole === 0) return "zero";
  if (whole < 0) return `negative ${numberToWords(-whole)}`;

  if (whole >= 100 && whole <= 9999 && whole % 100 === 0 && whole % 1000 !== 0) {
    const hundreds = whole / 100;
    if (hundreds < 20) return `${ONES[hundreds]} hundred`;
  }

  const parts: string[] = [];
  const top = SCALE.length - 1;
  let remaining = whole;
  for (let i = 0; i < top && remaining > 0; i++) {
    const chunk = remaining % 1000;
    if (chunk) {
      const w = threeDigitsToWords(chunk);
      parts.push(SCALE[i] ? `${w} ${SCALE[i]}` : w);
    }
    remaining = Math.floor(remaining / 1000);
  }
  // Whatever is left belongs to the largest scale; it may itself exceed 999.
  if (remaining > 0) parts.push(`${numberToWords(remaining)} ${SCALE[top]}`);
  return parts.reverse().join(" ");
}

/**
 * Words for an unsigned integer given as text. Digit strings too long to parse
 * exactly are read digit by digit rather than as a rounded value.
 */
function integerTextToWords(text: string): string {
  if (/^\d+$/.test(text)) {
    const significant = text.replace(/^0+(?=\d)/, "");
    if (significant.length > MAX_EXACT_DIGITS) return spellDigits(significant);
  }
  return numberToWords(parseInt(text, 10));
}

/**
 * Converts a decimal value to words, reading the fractional part one digit at
 * a time ("3.14" → "three point one four").
 *
 * @param value Decimal text, or a number. Prefer text: a number cannot carry
 *   trailing zeros ("1.50"), so they are never spoken for numeric input.
 * @param sep Word placed between the integer and fractional parts.
 * @returns The spoken form, prefixed with "negative" when the text starts with "-".
 */
export function floatToWords(value: string | number, sep = "point"): string {
  const text = typeof value === "string" ? value : toPlainDecimal(value);
  const negative = text.startsWith("-");
  const clean = negative ? text.slice(1) : text;

  let result: string;
  if (clean.includes(".")) {
    const [intPart = "", decPart = ""] = clean.split(".");
    const intWords = intPart ? integerTextToWords(intPart) : "zero";
    const decWords = spellDigits(decPart);
    // Without fractional digits there is nothing to say after the separator.
    result = decWords ? `${intWords} ${sep} ${decWords}` : intWords;
  } else {
    result = integerTextToWords(clean);
  }
  return negative ? `negative ${result}` : result;
}

/** Ordinal words for `n` (1 → "first", 20 → "twentieth", 21 → "twenty-first"). */
function ordinalSuffix(n: number): string {
  const word = numberToWords(n);
  // Only the final word changes, whether it follows a space or a hyphen.
  const cut = Math.max(word.lastIndexOf(" "), word.lastIndexOf("-"));
  const prefix = word.slice(0, cut + 1);
  const last = word.slice(cut + 1);

  let lastOrd: string;
  const irregular = ORDINAL_EXCEPTIONS[last];
  if (irregular) {
    lastOrd = irregular;
  } else if (last.endsWith("y")) {
    lastOrd = `${last.slice(0, -1)}ieth`;
  } else if (last.endsWith("t")) {
    lastOrd = `${last}h`;
  } else if (last.endsWith("e")) {
    lastOrd = `${last.slice(0, -1)}th`;
  } else {
    lastOrd = `${last}th`;
  }
  return prefix + lastOrd;
}

// ── Regex patterns ──
//
// NOTE(review): the pattern literals in this section were missing from the
// reviewed copy of this file (everything between a `<` and the next `>` had
// been stripped). They are rebuilt from how each expansion callback consumes
// its capture groups — confirm against the upstream preprocess.py before
// relying on exact matching behaviour.

/** Plain or comma-grouped number; never matches a bare, leading or trailing comma. */
const RE_NUMBER = /(?<![a-zA-Z])-?\d+(?:,\d+)*(?:\.\d+)?/g;
const RE_ORDINAL = /\b(\d+)(?:st|nd|rd|th)\b/gi;
const RE_PERCENT = /(-?\d+(?:,\d+)*(?:\.\d+)?)\s*%/g;
const RE_CURRENCY = /([$€£¥₹₩₿])\s*(\d+(?:,\d+)*(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])/g;
/** Groups: hour, minute, optional seconds, optional am/pm. Whitespace is consumed only before am/pm. */
const RE_TIME = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?(?:\s*(am|pm)\b)?(?!\d)/gi;
const RE_RANGE = /(?<!\w)(\d+)-(\d+)(?!\w)/g;
const RE_MODEL_VER = /\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)/g;
const RE_SCALE = /(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])/g;
const RE_SCI = /(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])/g;
const RE_FRACTION = /\b(\d+)\s*\/\s*(\d+)\b/g;
/** Captures the digits before the trailing "0s": "198" for "1980s", "8" for "80s". */
const RE_DECADE = /\b(\d{1,3})0s\b/g;

const UNIT_MAP: Record<string, string> = {
  km: "kilometers", kg: "kilograms", mg: "milligrams", ml: "milliliters",
  gb: "gigabytes", mb: "megabytes", kb: "kilobytes", tb: "terabytes",
  hz: "hertz", khz: "kilohertz", mhz: "megahertz", ghz: "gigahertz",
  mph: "miles per hour", kph: "kilometers per hour",
  ms: "milliseconds", ns: "nanoseconds", "µs": "microseconds",
  "°c": "degrees Celsius", "c°": "degrees Celsius",
  "°f": "degrees Fahrenheit", "f°": "degrees Fahrenheit",
};

// Derived from UNIT_MAP so the matcher and the lookup table cannot drift apart.
// A lookahead closes the pattern instead of `\b`, which can never succeed
// between "°" and a following space.
const UNIT_ALTERNATION = Object.keys(UNIT_MAP)
  .sort((a, b) => b.length - a.length)
  .map((unit) => unit.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
  .join("|");
const RE_UNIT = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${UNIT_ALTERNATION})(?![a-zA-Z\\d])`, "gi");

const SCALE_MAP: Record<string, string> = {
  K: "thousand", M: "million", B: "billion", T: "trillion",
};

const DECADE_MAP: Record<number, string> = {
  0: "hundreds", 1: "tens", 2: "twenties", 3: "thirties", 4: "forties",
  5: "fifties", 6: "sixties", 7: "seventies", 8: "eighties", 9: "nineties",
};

// ── Expansion functions ──

/** "21st" → "twenty-first". */
function expandOrdinals(text: string): string {
  return text.replace(RE_ORDINAL, (_match, n: string) => ordinalSuffix(parseInt(n, 10)));
}

/** "5.10%" → "five point one zero percent". */
function expandPercentages(text: string): string {
  return text.replace(RE_PERCENT, (_match, raw: string) => {
    // Keep the matched text as a string: a round trip through parseFloat drops
    // trailing zeros and prints tiny values in exponent notation.
    const clean = raw.replace(/,/g, "");
    return `${floatToWords(clean)} percent`;
  });
}

/**
 * "$1.50" → "one dollar and fifty cents"; "$5M" → "five million dollars".
 * The fractional part is always read as "cents", capped at two digits.
 */
function expandCurrency(text: string): string {
  return text.replace(
    RE_CURRENCY,
    (_match, symbol: string, raw: string, scaleSuffix: string | undefined) => {
      const clean = raw.replace(/,/g, "");
      const unit = CURRENCY_SYMBOLS[symbol] ?? "";
      // The unit is singular only for an amount of exactly one.
      const pluralFor = (count: string): string => (unit && !/^0*1$/.test(count) ? "s" : "");

      if (scaleSuffix) {
        return `${floatToWords(clean)} ${SCALE_MAP[scaleSuffix]} ${unit}s`.trim();
      }
      if (clean.includes(".")) {
        const [intPart = "", decPart = ""] = clean.split(".");
        const decVal = parseInt(decPart.slice(0, 2).padEnd(2, "0"), 10);
        let result = `${integerTextToWords(intPart)} ${unit}${pluralFor(intPart)}`;
        if (decVal) result += ` and ${numberToWords(decVal)} cent${decVal !== 1 ? "s" : ""}`;
        return result;
      }
      return `${integerTextToWords(clean)} ${unit}${pluralFor(clean)}`;
    },
  );
}

/**
 * "3:30 pm" → "three thirty pm"; "14:05" → "fourteen oh five";
 * "14:00" → "fourteen hundred". Seconds are matched but not spoken.
 */
function expandTime(text: string): string {
  return text.replace(
    RE_TIME,
    (_match, h: string, m: string, _s: string | undefined, suffix: string | undefined) => {
      const hour = parseInt(h, 10);
      const mins = parseInt(m, 10);
      const sfx = suffix ? ` ${suffix.toLowerCase()}` : "";
      const hWords = numberToWords(hour);
      if (mins === 0) return suffix ? `${hWords}${sfx}` : `${hWords} hundred`;
      if (mins < 10) return `${hWords} oh ${numberToWords(mins)}${sfx}`;
      return `${hWords} ${numberToWords(mins)}${sfx}`;
    },
  );
}

/** "10-20" → "ten to twenty". */
function expandRanges(text: string): string {
  return text.replace(
    RE_RANGE,
    (_match, lo: string, hi: string) =>
      `${numberToWords(parseInt(lo, 10))} to ${numberToWords(parseInt(hi, 10))}`,
  );
}

/** "GPT-4" → "GPT 4", so the version is read as a number rather than a negative. */
function expandModelNames(text: string): string {
  return text.replace(RE_MODEL_VER, (_match, name: string, ver: string) => `${name} ${ver}`);
}

/** "100ms" → "one hundred milliseconds"; unit lookup ignores case. */
function expandUnits(text: string): string {
  return text.replace(RE_UNIT, (_match, raw: string, unit: string) => {
    const expanded = UNIT_MAP[unit.toLowerCase()] ?? unit;
    return `${floatToWords(raw)} ${expanded}`;
  });
}

/** "7B" → "seven billion". */
function expandScaleSuffixes(text: string): string {
  return text.replace(RE_SCALE, (_match, raw: string, suffix: string) => {
    const scaleWord = SCALE_MAP[suffix] ?? suffix;
    return `${floatToWords(raw)} ${scaleWord}`;
  });
}

/** "1.5e-3" → "one point five times ten to the negative three". */
function expandScientific(text: string): string {
  return text.replace(RE_SCI, (_match, coeff: string, exp: string) => {
    const expVal = parseInt(exp, 10);
    const sign = expVal < 0 ? "negative " : "";
    return `${floatToWords(coeff)} times ten to the ${sign}${numberToWords(Math.abs(expVal))}`;
  });
}

/** "1/2" → "one half"; "3/4" → "three quarters"; "2/3" → "two thirds". A zero denominator is left untouched. */
function expandFractions(text: string): string {
  return text.replace(RE_FRACTION, (match, num: string, den: string) => {
    const n = parseInt(num, 10);
    const d = parseInt(den, 10);
    if (d === 0) return match;
    let dWord: string;
    if (d === 2) dWord = n === 1 ? "half" : "halves";
    else if (d === 4) dWord = n === 1 ? "quarter" : "quarters";
    else dWord = ordinalSuffix(d) + (n !== 1 ? "s" : "");
    return `${numberToWords(n)} ${dWord}`;
  });
}

/** "80s" → "eighties"; "1980s" → "nineteen eighties". */
function expandDecades(text: string): string {
  return text.replace(RE_DECADE, (_match, base: string) => {
    const b = parseInt(base, 10);
    const decadeWord = DECADE_MAP[b % 10] ?? "";
    if (b < 10) return decadeWord;
    return `${numberToWords(Math.floor(b / 10))} ${decadeWord}`;
  });
}

/** Converts every remaining plain, comma-grouped or decimal number to words. */
function replaceNumbers(text: string): string {
  return text.replace(RE_NUMBER, (match) => floatToWords(match.replace(/,/g, "")));
}

/** ".5" → "0.5" and "-.5" → "-0.5", so bare decimals are picked up as numbers. */
function normalizeLeadingDecimals(text: string): string {
  return text.replace(/(?<!\d)\.(?=\d)/g, "0.");
}

// ── Main preprocessor ──

// NOTE(review): RE_URL and RE_EMAIL (and the head of RE_HTML) were also missing
// from the reviewed copy; rebuilt from how preprocessText uses them — confirm
// against upstream.
const RE_URL = /https?:\/\/\S+|www\.\S+/g;
const RE_EMAIL = /\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b/g;
const RE_HTML = /<[^>]+>/g;
// NOTE(review): `\w` is ASCII-only in JavaScript, so accented letters are
// replaced by spaces here — confirm this is acceptable for the target voices.
const RE_PUNCT = /[^\w\s.,?!;:\-\u2014\u2013\u2026]/g;
const RE_SPACES = /\s+/g;

/**
 * Normalises raw text for speech synthesis: strips URLs, e-mail addresses and
 * HTML tags, spells out numeric forms (currency, percentages, scientific
 * notation, times, ordinals, units, scale suffixes, fractions, decades, ranges,
 * model versions, then any remaining numbers), replaces punctuation that
 * carries no prosody with spaces, lowercases, and collapses whitespace.
 *
 * The specialised expansions run before the generic number pass because each
 * one consumes digits the generic pass would otherwise read without context.
 */
export function preprocessText(text: string): string {
  // Remove URLs, emails, HTML
  text = text.replace(RE_URL, "");
  text = text.replace(RE_EMAIL, "");
  text = text.replace(RE_HTML, " ");

  // Normalize leading decimals
  text = normalizeLeadingDecimals(text);

  // Expand special forms before generic number replacement
  text = expandCurrency(text);
  text = expandPercentages(text);
  text = expandScientific(text);
  text = expandTime(text);
  text = expandOrdinals(text);
  text = expandUnits(text);
  text = expandScaleSuffixes(text);
  text = expandFractions(text);
  text = expandDecades(text);
  text = expandRanges(text);
  text = expandModelNames(text);
  text = replaceNumbers(text);

  // Remove non-prosodic punctuation
  text = text.replace(RE_PUNCT, " ");

  // Lowercase and collapse whitespace
  text = text.toLowerCase();
  text = text.replace(RE_SPACES, " ").trim();
  return text;
}