Spaces:
Running
Running
/**
 * Text preprocessor — converts numbers, currencies, ordinals, etc. to words.
 * Port of KittenTTS preprocess.py.
 * https://github.com/KittenML/KittenTTS
 */
// ── Number → Words ──

/**
 * Words for 0–19, indexed by value. Index 0 is the empty string so that a zero
 * digit contributes no word when a larger number is assembled.
 */
const ONES = [
  "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
  "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
  "seventeen", "eighteen", "nineteen",
];

/** Words for the tens digit, indexed by that digit; 0 and 1 are empty because 0–19 come from ONES. */
const TENS = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"];

/**
 * Names of successive powers of one thousand (index i names 1000^i).
 * Runs through "quintillion" so that every safe integer has a scale name;
 * numberToWords folds anything larger into a multiple of the last entry.
 */
const SCALE = ["", "thousand", "million", "billion", "trillion", "quadrillion", "quintillion"];

/** Cardinal words whose ordinal form is irregular. */
const ORDINAL_EXCEPTIONS: Record<string, string> = {
  one: "first", two: "second", three: "third", four: "fourth",
  five: "fifth", six: "sixth", seven: "seventh", eight: "eighth",
  nine: "ninth", twelve: "twelfth",
};

/** Singular spoken name for each supported currency symbol. */
const CURRENCY_SYMBOLS: Record<string, string> = {
  "$": "dollar", "€": "euro", "£": "pound", "¥": "yen",
  "₹": "rupee", "₩": "won", "₿": "bitcoin",
};

/**
 * Spells out an integer in the range 0–999.
 *
 * @param n - Integer from 0 to 999.
 * @returns The words for `n` (tens and ones joined by a hyphen), or "" for 0.
 */
function threeDigitsToWords(n: number): string {
  if (n === 0) return "";
  const parts: string[] = [];
  const hundreds = Math.floor(n / 100);
  const remainder = n % 100;
  if (hundreds) parts.push(`${ONES[hundreds]} hundred`);
  if (remainder < 20) {
    // 1–19 have dedicated words; 0 adds nothing.
    if (remainder) parts.push(ONES[remainder]);
  } else {
    const tensWord = TENS[Math.floor(remainder / 10)];
    const onesWord = ONES[remainder % 10];
    parts.push(onesWord ? `${tensWord}-${onesWord}` : tensWord);
  }
  return parts.join(" ");
}

/**
 * Converts a number to English cardinal words.
 *
 * @param n - Value to spell out. A fractional part is discarded by truncating
 *   toward zero.
 * @returns The spoken form, prefixed with "negative" for values below zero.
 *   Returns "" for NaN and ±Infinity.
 */
export function numberToWords(n: number): string {
  // Callers pass parseInt() results, which may be NaN; there is nothing to say for it.
  if (!Number.isFinite(n)) return "";
  // Truncate toward zero so that -1.5 reads "negative one" rather than "negative two".
  n = Math.trunc(n);
  if (n === 0) return "zero"; // also catches -0
  if (n < 0) return `negative ${numberToWords(-n)}`;

  // Multiples of 100 up to 9,900 that are not whole thousands are read as
  // "X hundred" when X < 20 (1200 → "twelve hundred").
  if (n >= 100 && n <= 9999 && n % 100 === 0 && n % 1000 !== 0) {
    const hundreds = n / 100;
    if (hundreds < 20) return `${ONES[hundreds]} hundred`;
  }

  const parts: string[] = [];
  const lastScale = SCALE.length - 1;
  let remaining = n;
  for (let i = 0; i <= lastScale && remaining > 0; i++) {
    // The final scale absorbs everything that is left, so values beyond the
    // table are spoken as a multiple of the largest name instead of being dropped.
    const isLast = i === lastScale;
    const chunk = isLast ? remaining : remaining % 1000;
    if (chunk) {
      const w = chunk < 1000 ? threeDigitsToWords(chunk) : numberToWords(chunk);
      parts.push(SCALE[i] ? `${w} ${SCALE[i]}` : w);
    }
    remaining = isLast ? 0 : Math.floor(remaining / 1000);
  }
  // Chunks were collected least-significant first.
  return parts.reverse().join(" ");
}
/**
 * Rewrites an unsigned numeral in exponent notation ("1.5e-7") as plain
 * decimal digits ("0.00000015"). Text that is not in that form, or whose
 * exponent is implausibly large, is returned unchanged.
 */
function expandExponentNotation(text: string): string {
  const m = /^(\d+)(?:\.(\d*))?e([+-]?\d+)$/i.exec(text);
  if (!m) return text;
  const exponent = parseInt(m[3], 10);
  // Guard against building enormous strings from hostile input such as "1e999999999".
  if (Math.abs(exponent) > 400) return text;
  const intDigits = m[1];
  const digits = intDigits + (m[2] || "");
  const pointIndex = intDigits.length + exponent;
  if (pointIndex <= 0) return `0.${"0".repeat(-pointIndex)}${digits}`;
  if (pointIndex >= digits.length) return digits + "0".repeat(pointIndex - digits.length);
  return `${digits.slice(0, pointIndex)}.${digits.slice(pointIndex)}`;
}

/**
 * Spells out a decimal number: the integer part as a cardinal, then `sep`,
 * then each fractional digit individually ("3.14" → "three point one four").
 *
 * @param value - Numeral as text, or a number. Numbers are stringified first, so
 *   trailing zeros survive only when a string is passed.
 * @param sep - Word spoken for the decimal point.
 * @returns The spoken form, prefixed with "negative" when the text starts with "-".
 *   Returns "" for NaN and ±Infinity.
 */
export function floatToWords(value: string | number, sep = "point"): string {
  if (typeof value === "number" && !Number.isFinite(value)) return "";
  const text = typeof value === "string" ? value : `${value}`;
  const negative = text.startsWith("-");
  // Very large/small numbers stringify as "1e+21" / "1.5e-7"; expand them so the
  // exponent characters are not mistaken for digits.
  const clean = expandExponentNotation(negative ? text.slice(1) : text);
  let result: string;
  if (clean.includes(".")) {
    const [intPart, decPart = ""] = clean.split(".");
    const intWords = intPart ? numberToWords(parseInt(intPart, 10)) : "zero";
    const digitMap = ["zero", ...ONES.slice(1)];
    // Only real digits are spoken; anything else would index past the digit names.
    const decWords = [...decPart]
      .filter((d) => d >= "0" && d <= "9")
      .map((d) => digitMap[parseInt(d, 10)])
      .join(" ");
    // "5." has no fractional digits: say just the integer instead of a dangling separator.
    result = decWords ? `${intWords} ${sep} ${decWords}` : intWords;
  } else {
    result = numberToWords(parseInt(clean, 10));
  }
  // Do not emit a bare "negative " when nothing could be spelled out.
  return negative && result ? `negative ${result}` : result;
}
/**
 * Returns the ordinal words for `n` by inflecting the final word of its
 * cardinal form (21 → "twenty-first", 20 → "twentieth").
 *
 * @param n - Number to convert; passed to numberToWords.
 * @returns The ordinal phrase, or "" when numberToWords produces no words.
 */
function ordinalSuffix(n: number): string {
  const word = numberToWords(n);
  if (!word) return "";

  // The word to inflect is whatever follows the LAST space or hyphen. Looking only
  // at the last hyphen mis-splits phrases such as "twenty-one thousand one".
  const cut = Math.max(word.lastIndexOf(" "), word.lastIndexOf("-")) + 1;
  const prefix = word.slice(0, cut); // keeps its trailing joiner, if any
  const last = word.slice(cut);

  let lastOrd: string;
  const irregular = ORDINAL_EXCEPTIONS[last];
  if (irregular) {
    lastOrd = irregular;
  } else if (last.endsWith("y")) {
    // twenty → twentieth, thirty → thirtieth, …
    lastOrd = `${last.slice(0, -1)}ieth`;
  } else if (last.endsWith("t")) {
    lastOrd = last + "h";
  } else if (last.endsWith("e")) {
    lastOrd = last.slice(0, -1) + "th";
  } else {
    lastOrd = last + "th";
  }
  return prefix + lastOrd;
}
// ── Regex patterns ──

/**
 * Plain number: optional "-", digits with optional comma-separated groups, optional
 * ".digits". A match may not start directly after an ASCII letter. A comma is only
 * consumed when digits follow it, so list punctuation ("5, 6") stays in the text.
 */
const RE_NUMBER = /(?<![a-zA-Z])-?\d+(?:,\d+)*(?:\.\d+)?/g;
/** Digits directly followed by st/nd/rd/th (any case); group 1 is the digits. */
const RE_ORDINAL = /\b(\d+)(st|nd|rd|th)\b/gi;
/** Number (group 1, may contain commas and a sign) followed by optional whitespace and "%". */
const RE_PERCENT = /(-?\d+(?:,\d+)*(?:\.\d+)?)\s*%/g;
/**
 * Currency symbol (group 1), amount (group 2), optional K/M/B/T magnitude (group 3).
 * The match must not be followed by an ASCII letter or digit.
 */
const RE_CURRENCY = /([$€£¥₹₩₿])\s*(\d+(?:,\d+)*(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])/g;
/** h:mm or h:mm:ss with optional am/pm; groups are hour, minute, second, suffix. */
const RE_TIME = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\b/gi;
/** Two digit runs joined by a hyphen, with no word character on either side. */
const RE_RANGE = /(?<!\w)(\d+)-(\d+)(?!\w)/g;
/** Alphanumeric name (group 1), hyphen, then a digits-and-dots version (group 2). */
const RE_MODEL_VER = /\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)/g;
/**
 * Unsigned number (group 1) followed by a known unit symbol (group 2).
 * Ends with `(?!\w)` rather than `\b`: a word boundary can never follow the "°" of
 * "C°"/"F°" when whitespace or punctuation comes next, so those forms were unmatchable.
 */
const RE_UNIT = /(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)(?!\w)/gi;
/** Unsigned number (group 1) followed by an upper-case K/M/B/T (group 2) that is not part of a longer token. */
const RE_SCALE = /(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])/g;
/** Scientific notation: coefficient (group 1), "e"/"E", signed exponent (group 2). */
const RE_SCI = /(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])/g;
/** numerator "/" denominator, whitespace around the slash allowed. */
const RE_FRACTION = /\b(\d+)\s*\/\s*(\d+)\b/g;
/** One to three digits followed by "0s" (e.g. "1980s"); group 1 excludes the trailing zero. */
const RE_DECADE = /\b(\d{1,3})0s\b/g;

/** Spoken (plural) name for each unit symbol, keyed by the lower-cased symbol. */
const UNIT_MAP: Record<string, string> = {
  km: "kilometers", kg: "kilograms", mg: "milligrams", ml: "milliliters",
  gb: "gigabytes", mb: "megabytes", kb: "kilobytes", tb: "terabytes",
  hz: "hertz", khz: "kilohertz", mhz: "megahertz", ghz: "gigahertz",
  mph: "miles per hour", kph: "kilometers per hour",
  ms: "milliseconds", ns: "nanoseconds", "µs": "microseconds",
  "°c": "degrees Celsius", "c°": "degrees Celsius",
  "°f": "degrees Fahrenheit", "f°": "degrees Fahrenheit",
};

/** Spoken word for each magnitude suffix letter. */
const SCALE_MAP: Record<string, string> = {
  K: "thousand", M: "million", B: "billion", T: "trillion",
};

/** Decade name keyed by the tens digit of the year. */
const DECADE_MAP: Record<number, string> = {
  0: "hundreds", 1: "tens", 2: "twenties", 3: "thirties", 4: "forties",
  5: "fifties", 6: "sixties", 7: "seventies", 8: "eighties", 9: "nineties",
};
// ── Expansion functions ──

/**
 * Replaces every RE_ORDINAL match (digits followed by st/nd/rd/th) with the
 * ordinal words that ordinalSuffix returns for those digits.
 */
function expandOrdinals(text: string): string {
  const speak = (_match: string, digits: string): string => {
    const value = parseInt(digits, 10);
    return ordinalSuffix(value);
  };
  return text.replace(RE_ORDINAL, speak);
}
/**
 * Replaces "<number>%" with "<number in words> percent".
 *
 * Thousands separators are removed before conversion. Decimal values are passed
 * to floatToWords as text, so very small values are not misread through exponent
 * notation and trailing zeros are spoken as written.
 */
function expandPercentages(text: string): string {
  return text.replace(RE_PERCENT, (match: string, raw: string) => {
    const clean = raw.replace(/,/g, "");
    // Nothing numeric was captured (e.g. only commas): leave the text untouched.
    if (!/\d/.test(clean)) return match;
    const words = clean.includes(".")
      ? floatToWords(clean)
      : numberToWords(parseInt(clean, 10));
    return `${words} percent`;
  });
}
/** Plural of a currency name; "yen" and "won" are the same in singular and plural. */
function pluralizeCurrency(unit: string): string {
  if (!unit || unit === "yen" || unit === "won") return unit;
  return `${unit}s`;
}

/**
 * Replaces currency amounts matched by RE_CURRENCY with words.
 *
 * - With a K/M/B/T suffix: "<amount> <scale word> <plural unit>".
 * - With a decimal part: "<whole> <unit> and <n> cent(s)", using the first two
 *   fractional digits; the cents phrase is omitted when they are zero.
 * - Otherwise: "<amount> <unit>".
 *
 * The unit is singular only when the whole amount is exactly one.
 */
function expandCurrency(text: string): string {
  return text.replace(
    RE_CURRENCY,
    (match: string, symbol: string, raw: string, scaleSuffix: string | undefined) => {
      const clean = raw.replace(/,/g, "");
      // Nothing numeric was captured (e.g. "$,"): leave the text untouched.
      if (!/\d/.test(clean)) return match;

      const unit = CURRENCY_SYMBOLS[symbol] || "";
      const units = pluralizeCurrency(unit);

      if (scaleSuffix) {
        const scaleWord = SCALE_MAP[scaleSuffix];
        const num = clean.includes(".")
          ? floatToWords(clean)
          : numberToWords(parseInt(clean, 10));
        return `${num} ${scaleWord} ${units}`.trim();
      }

      if (clean.includes(".")) {
        const [intPart, decPart = ""] = clean.split(".");
        const whole = parseInt(intPart || "0", 10);
        // Two fractional digits make up the cents; "5" is padded to "50".
        const cents = parseInt(decPart.slice(0, 2).padEnd(2, "0"), 10);
        let result = `${numberToWords(whole)} ${whole === 1 ? unit : units}`;
        if (cents) result += ` and ${numberToWords(cents)} cent${cents !== 1 ? "s" : ""}`;
        return result;
      }

      const val = parseInt(clean, 10);
      return `${numberToWords(val)} ${val === 1 ? unit : units}`;
    },
  );
}
/**
 * Replaces clock times matched by RE_TIME with words.
 *
 * - Minutes 10–59: "<hour> <minutes>".
 * - Minutes 01–09: "<hour> oh <minutes>".
 * - Minutes 00: "<hour>" when an am/pm suffix is present, otherwise "<hour> hundred".
 *
 * A lower-cased am/pm suffix is appended when present. A seconds field is
 * matched but not spoken.
 */
function expandTime(text: string): string {
  return text.replace(
    RE_TIME,
    (
      _full: string,
      hh: string,
      mm: string,
      _ss: string | undefined,
      meridiem: string | undefined,
    ) => {
      const hourWords = numberToWords(parseInt(hh, 10));
      const minutes = parseInt(mm, 10);
      const tail = meridiem ? ` ${meridiem.toLowerCase()}` : "";

      if (minutes >= 10) {
        return `${hourWords} ${numberToWords(minutes)}${tail}`;
      }
      if (minutes !== 0) {
        return `${hourWords} oh ${numberToWords(minutes)}${tail}`;
      }
      // On the hour: the suffix alone is enough, otherwise read it as "hundred".
      if (meridiem) {
        return `${hourWords}${tail}`;
      }
      return `${hourWords} hundred${tail}`;
    },
  );
}
/** Replaces "<a>-<b>" digit ranges matched by RE_RANGE with "<a in words> to <b in words>". */
function expandRanges(text: string): string {
  const speak = (digits: string): string => numberToWords(parseInt(digits, 10));
  return text.replace(RE_RANGE, (_span: string, lo: string, hi: string) => {
    const from = speak(lo);
    const until = speak(hi);
    return `${from} to ${until}`;
  });
}
/**
 * Replaces the hyphen between a name and its version (as matched by
 * RE_MODEL_VER) with a space, leaving both parts otherwise unchanged.
 */
function expandModelNames(text: string): string {
  // Both groups always participate in a match, so a replacement template suffices.
  return text.replace(RE_MODEL_VER, "$1 $2");
}
/**
 * Replaces "<number><unit symbol>" matched by RE_UNIT with
 * "<number in words> <unit name>".
 *
 * Decimal values are passed to floatToWords as text so they are read exactly as
 * written. The unit name from UNIT_MAP is plural; it is made singular when the
 * value is the integer one ("1 km" → "one kilometer"). A symbol missing from
 * UNIT_MAP is kept as written.
 */
function expandUnits(text: string): string {
  return text.replace(RE_UNIT, (_match: string, raw: string, unit: string) => {
    const known = UNIT_MAP[unit.toLowerCase()];
    const isDecimal = raw.includes(".");
    const num = isDecimal ? floatToWords(raw) : numberToWords(parseInt(raw, 10));

    let name = known || unit;
    if (known && !isDecimal && parseInt(raw, 10) === 1) {
      // Drop the plural "s" of the leading word only:
      // "miles per hour" → "mile per hour"; "hertz" is unaffected.
      name = known.replace(/^([a-z]+)s\b/, "$1");
    }
    return `${num} ${name}`;
  });
}
/**
 * Replaces "<number><K|M|B|T>" matched by RE_SCALE with
 * "<number in words> <scale word>". A suffix missing from SCALE_MAP is kept as is.
 */
function expandScaleSuffixes(text: string): string {
  return text.replace(RE_SCALE, (_hit: string, raw: string, suffix: string) => {
    let spoken: string;
    if (raw.includes(".")) {
      spoken = floatToWords(raw);
    } else {
      spoken = numberToWords(parseInt(raw, 10));
    }
    const magnitude = SCALE_MAP[suffix] || suffix;
    return `${spoken} ${magnitude}`;
  });
}
/**
 * Replaces scientific notation matched by RE_SCI with
 * "<coefficient in words> times ten to the <exponent in words>".
 * A negative exponent is spoken with a leading "negative".
 */
function expandScientific(text: string): string {
  return text.replace(RE_SCI, (_hit: string, coeff: string, exp: string) => {
    const exponent = parseInt(exp, 10);
    const mantissaWords = coeff.includes(".")
      ? floatToWords(coeff)
      : numberToWords(parseInt(coeff, 10));
    const magnitudeWords = numberToWords(Math.abs(exponent));
    const exponentWords = exponent < 0 ? `negative ${magnitudeWords}` : magnitudeWords;
    return `${mantissaWords} times ten to the ${exponentWords}`;
  });
}
/**
 * Replaces "<n>/<d>" matched by RE_FRACTION with "<n in words> <denominator word>".
 *
 * Denominators 2 and 4 use half/halves and quarter/quarters; every other
 * denominator uses its ordinal, with an "s" appended unless the numerator is 1.
 * A zero denominator leaves the original text in place.
 */
function expandFractions(text: string): string {
  const denominatorWord = (d: number, single: boolean): string => {
    switch (d) {
      case 2:
        return single ? "half" : "halves";
      case 4:
        return single ? "quarter" : "quarters";
      default: {
        const ordinal = ordinalSuffix(d);
        return single ? ordinal : `${ordinal}s`;
      }
    }
  };

  return text.replace(RE_FRACTION, (whole: string, num: string, den: string) => {
    const d = parseInt(den, 10);
    if (d === 0) return whole;
    const n = parseInt(num, 10);
    return `${numberToWords(n)} ${denominatorWord(d, n === 1)}`;
  });
}
/**
 * Replaces decade forms matched by RE_DECADE (such as "1980s" or "80s") with words.
 *
 * The captured digits exclude the trailing zero: their last digit selects the
 * decade name from DECADE_MAP and, when more digits precede it, those are spoken
 * first as a number.
 */
function expandDecades(text: string): string {
  return text.replace(RE_DECADE, (_hit: string, base: string) => {
    const value = parseInt(base, 10);
    const name = DECADE_MAP[value % 10] || "";
    // A single captured digit is a bare decade ("80s"): the name alone.
    return value >= 10 ? `${numberToWords(Math.floor(value / 10))} ${name}` : name;
  });
}
/**
 * Replaces every remaining RE_NUMBER match with words: decimals via
 * floatToWords, integers via numberToWords. Commas inside the match are treated
 * as thousands separators and removed.
 *
 * Commas at the very end of a match are sentence punctuation, not separators,
 * so they are kept after the words. A match that contains no digit at all is
 * returned unchanged instead of being replaced by an empty string.
 */
function replaceNumbers(text: string): string {
  return text.replace(RE_NUMBER, (match: string) => {
    const trailing = /,*$/.exec(match);
    const tail = trailing ? trailing[0] : "";
    const clean = match.slice(0, match.length - tail.length).replace(/,/g, "");
    if (!/\d/.test(clean)) return match;
    const words = clean.includes(".")
      ? floatToWords(clean)
      : numberToWords(parseInt(clean, 10));
    return `${words}${tail}`;
  });
}
/**
 * Inserts the missing leading zero in decimals written without one
 * (".5" → "0.5", "-.5" → "-0.5").
 *
 * A dot is only treated as a decimal point when a digit follows it and it is
 * not directly preceded by a letter, digit, underscore or another dot, so
 * tokens such as "fig.3", "1.5" and "..5" are left alone.
 */
function normalizeLeadingDecimals(text: string): string {
  return text.replace(/(?<![\p{L}\p{N}_.])\.(?=\d)/gu, "0.");
}
/** http(s):// URLs and bare "www." hosts, up to the next whitespace. */
const RE_URL = /https?:\/\/\S+|www\.\S+/g;
/** E-mail addresses with an alphabetic top-level domain of two or more letters. */
const RE_EMAIL = /\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b/gi;
/**
 * Markup tags: "<" immediately followed by a letter, "/letter", "!" or "?", up to
 * the next ">". Requiring that start keeps comparisons such as "3 < 5 and 7 > 2"
 * from being stripped as if they were a tag.
 */
const RE_HTML = /<(?:\/?[a-zA-Z]|[!?])[^>]*>/g;
/**
 * Any character that is not a letter, combining mark, digit, underscore,
 * whitespace, or one of . , ? ! ; : - — – …
 * Uses Unicode classes because JavaScript's `\w` is ASCII-only and would delete
 * accented and non-Latin letters.
 */
const RE_PUNCT = /[^\p{L}\p{M}\p{N}_\s.,?!;:\-\u2014\u2013\u2026]/gu;
/** Runs of whitespace. */
const RE_SPACES = /\s+/g;
/**
 * Normalizes raw text into lower-case words.
 *
 * Steps, in order:
 * 1. Remove URLs and e-mail addresses; replace markup tags with a space.
 * 2. Add the leading zero to bare decimals.
 * 3. Expand special numeric forms (currency, percentages, scientific notation,
 *    times, ordinals, units, scale suffixes, fractions, decades, ranges, model
 *    versions) and finally any remaining plain numbers. The specific forms run
 *    before the generic number pass.
 * 4. Replace characters matched by RE_PUNCT with spaces, lower-case, collapse
 *    whitespace and trim.
 *
 * @param text - Raw input text.
 * @returns The normalized text.
 */
export function preprocessText(text: string): string {
  const stripped = text
    .replace(RE_URL, "")
    .replace(RE_EMAIL, "")
    .replace(RE_HTML, " ");

  // Order matters: each stage sees the output of the previous one.
  const stages: Array<(input: string) => string> = [
    normalizeLeadingDecimals,
    expandCurrency,
    expandPercentages,
    expandScientific,
    expandTime,
    expandOrdinals,
    expandUnits,
    expandScaleSuffixes,
    expandFractions,
    expandDecades,
    expandRanges,
    expandModelNames,
    replaceNumbers,
  ];
  const spoken = stages.reduce((current, stage) => stage(current), stripped);

  return spoken
    .replace(RE_PUNCT, " ")
    .toLowerCase()
    .replace(RE_SPACES, " ")
    .trim();
}