KittenTTS-WebGPU / src /lib /preprocess.ts
shreyask's picture
feat: KittenTTS WebGPU browser demo
9b1aef8 verified
/**
* Text preprocessor — converts numbers, currencies, ordinals, etc. to words.
* Port of KittenTTS preprocess.py.
* https://github.com/KittenML/KittenTTS
*/
// ── Number → Words ──
// Words for 0–19, indexed by value. Index 0 is empty because a zero digit is
// never spoken inside a larger number.
const ONES = [
  "",
  "one", "two", "three", "four", "five",
  "six", "seven", "eight", "nine", "ten",
  "eleven", "twelve", "thirteen", "fourteen", "fifteen",
  "sixteen", "seventeen", "eighteen", "nineteen",
];

// Words for multiples of ten, indexed by the tens digit. Indices 0 and 1 are
// empty because values below twenty are taken from ONES.
const TENS = [
  "", "",
  "twenty", "thirty", "forty", "fifty",
  "sixty", "seventy", "eighty", "ninety",
];

// Name of each successive group of three digits, least significant first.
const SCALE = [
  "",
  "thousand",
  "million",
  "billion",
  "trillion",
];

// Cardinal words whose ordinal form is not built by a simple suffix rule.
const ORDINAL_EXCEPTIONS: Record<string, string> = {
  one: "first",
  two: "second",
  three: "third",
  four: "fourth",
  five: "fifth",
  six: "sixth",
  seven: "seventh",
  eight: "eighth",
  nine: "ninth",
  twelve: "twelfth",
};

// Currency symbol → singular currency name.
const CURRENCY_SYMBOLS: Record<string, string> = {
  "$": "dollar",
  "€": "euro",
  "£": "pound",
  "¥": "yen",
  "₹": "rupee",
  "₩": "won",
  "₿": "bitcoin",
};
/**
 * Spell out one group of up to three digits.
 *
 * @param n - Value of the group; callers pass 0–999.
 * @returns The words for the group, or "" when the group is zero.
 */
function threeDigitsToWords(n: number): string {
  if (n === 0) return "";

  const hundredsDigit = Math.floor(n / 100);
  const belowHundred = n % 100;
  const words: string[] = [];

  if (hundredsDigit) words.push(`${ONES[hundredsDigit]} hundred`);

  if (belowHundred >= 20) {
    // Twenty and above: tens word, hyphenated with the unit when there is one.
    const tens = TENS[Math.floor(belowHundred / 10)];
    const unit = ONES[belowHundred % 10];
    words.push(unit ? `${tens}-${unit}` : tens);
  } else if (belowHundred) {
    // 1–19 come straight from the lookup table.
    words.push(ONES[belowHundred]);
  }

  return words.join(" ");
}
/**
 * Spell out an integer in English words.
 *
 * - Fractional input is truncated toward zero (so -1.5 reads as "negative one").
 * - Negative values are prefixed with "negative".
 * - Multiples of 100 below 10000 that are not multiples of 1000, and whose
 *   hundreds count is under 20, use the "X hundred" form (1200 → "twelve hundred").
 * - Values too large for the SCALE table are expressed as a multiple of its
 *   largest entry instead of silently losing their high-order digits.
 *
 * @param n - Number to convert.
 * @returns The spelled-out number, or "" for NaN and ±Infinity.
 */
export function numberToWords(n: number): string {
  // Non-finite values have no spoken form; keep returning an empty string.
  if (!Number.isFinite(n)) return "";
  if (!Number.isInteger(n)) n = Math.trunc(n);
  if (n === 0) return "zero";
  if (n < 0) return `negative ${numberToWords(-n)}`;

  if (n >= 100 && n <= 9999 && n % 100 === 0 && n % 1000 !== 0) {
    const hundreds = Math.floor(n / 100);
    if (hundreds < 20) return `${ONES[hundreds]} hundred`;
  }

  const lastScale = SCALE.length - 1;
  const parts: string[] = [];
  let remaining = n;
  for (let i = 0; remaining > 0; i++) {
    if (i === lastScale) {
      // Everything that is left is counted in units of the largest scale
      // word, so no digits are dropped for very large inputs.
      parts.push(`${numberToWords(remaining)} ${SCALE[i]}`);
      break;
    }
    const chunk = remaining % 1000;
    if (chunk) {
      const w = threeDigitsToWords(chunk);
      parts.push(SCALE[i] ? `${w} ${SCALE[i]}` : w);
    }
    remaining = Math.floor(remaining / 1000);
  }
  // Groups were collected least significant first.
  return parts.reverse().join(" ");
}
/**
 * Rewrite an unsigned number written in exponent notation ("1e-7", "1.5e+21")
 * as plain decimal digits. Text in any other form is returned unchanged.
 */
function expandExponentNotation(text: string): string {
  const m = /^(\d+)(?:\.(\d+))?[eE]([+-]?\d+)$/.exec(text);
  if (!m) return text;
  const intDigits = m[1];
  const fracDigits = m[2] || "";
  const exponent = parseInt(m[3], 10);
  // Guard against building absurdly long strings from hand-written input.
  if (Math.abs(exponent) > 400) return text;

  const digits = intDigits + fracDigits;
  const pointPos = intDigits.length + exponent;
  if (pointPos <= 0) return `0.${"0".repeat(-pointPos)}${digits}`;
  if (pointPos >= digits.length) return digits + "0".repeat(pointPos - digits.length);
  return `${digits.slice(0, pointPos)}.${digits.slice(pointPos)}`;
}

/**
 * Spell out a decimal number: the integer part as a number, then `sep`, then
 * the fractional digits one by one ("3.14" → "three point one four").
 *
 * A string argument is read as written, so trailing zeros are spoken. A number
 * argument is converted with its default string form; if that form uses
 * exponent notation (for example 1e-7) it is expanded to plain digits first
 * rather than being misread as the bare coefficient.
 *
 * @param value - Number, or its textual form, to convert.
 * @param sep - Word spoken for the decimal point.
 * @returns The spelled-out value, prefixed with "negative" when it starts with "-".
 */
export function floatToWords(value: string | number, sep = "point"): string {
  const text = typeof value === "string" ? value : `${value}`;
  const negative = text.startsWith("-");
  const clean = expandExponentNotation(negative ? text.slice(1) : text);

  let result: string;
  if (clean.includes(".")) {
    const [intPart, decPart] = clean.split(".");
    // An empty integer part (".5") is spoken as "zero".
    const intWords = intPart ? numberToWords(parseInt(intPart, 10)) : "zero";
    const digitMap = ["zero", ...ONES.slice(1)];
    const decWords = [...decPart].map((d) => digitMap[parseInt(d, 10)]).join(" ");
    result = `${intWords} ${sep} ${decWords}`;
  } else {
    result = numberToWords(parseInt(clean, 10));
  }
  return negative ? `negative ${result}` : result;
}
/**
 * Spell out a number as an English ordinal ("twenty-first", "one hundredth").
 *
 * Only the final word of the cardinal form is converted; everything before it,
 * including the space or hyphen that joins it, is kept as is.
 *
 * @param n - Number to convert.
 * @returns The ordinal words.
 */
function ordinalSuffix(n: number): string {
  const word = numberToWords(n);

  // The final word starts after the last space OR hyphen, whichever comes
  // later. Looking at hyphens alone mis-splits values such as
  // "twenty-one thousand five".
  const cut = Math.max(word.lastIndexOf(" "), word.lastIndexOf("-"));
  const prefix = cut >= 0 ? word.slice(0, cut + 1) : "";
  const last = word.slice(cut + 1);

  let lastOrd: string;
  if (ORDINAL_EXCEPTIONS[last]) {
    lastOrd = ORDINAL_EXCEPTIONS[last];
  } else if (last.endsWith("y")) {
    // twenty → twentieth, thirty → thirtieth, ...
    lastOrd = last.slice(0, -1) + "ieth";
  } else if (last.endsWith("t")) {
    lastOrd = last + "h";
  } else if (last.endsWith("e")) {
    lastOrd = last.slice(0, -1) + "th";
  } else {
    lastOrd = last + "th";
  }
  return prefix + lastOrd;
}
// ── Regex patterns ──
// Optional minus, a run of digits and commas, optional decimal part; must not
// directly follow an ASCII letter.
// NOTE(review): commas are part of the digit class, so a run of commas with no
// digits also matches.
const RE_NUMBER = /(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?/g;
// Digits followed by st/nd/rd/th (any case). Group 1: the digits.
const RE_ORDINAL = /\b(\d+)(st|nd|rd|th)\b/gi;
// A number (group 1, may include a minus and commas), optional whitespace, "%".
const RE_PERCENT = /(-?[\d,]+(?:\.\d+)?)\s*%/g;
// Group 1: currency symbol; group 2: amount; group 3: optional upper-case
// K/M/B/T suffix. The match must not be followed by a letter or digit.
const RE_CURRENCY = /([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])/g;
// h:mm or h:mm:ss with an optional am/pm marker (any case).
// Groups: hours, minutes, seconds, marker.
const RE_TIME = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\b/gi;
// digits-digits with no word character on either side.
const RE_RANGE = /(?<!\w)(\d+)-(\d+)(?!\w)/g;
// Letter-led alphanumeric name, a hyphen, then a version made of digits and
// dots; must be followed by a character that is neither digit nor dot, or by
// the end of the input.
const RE_MODEL_VER = /\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)/g;
// A number (group 1, no commas or sign) followed by a unit abbreviation
// (group 2, any case).
const RE_UNIT = /(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b/gi;
// A number followed by an upper-case K/M/B/T; not preceded by a letter and not
// followed by a letter or digit.
const RE_SCALE = /(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])/g;
// Scientific notation. Group 1: coefficient; group 2: signed exponent.
const RE_SCI = /(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])/g;
// digits / digits, with optional whitespace around the slash.
const RE_FRACTION = /\b(\d+)\s*\/\s*(\d+)\b/g;
// One to three digits (group 1) followed by "0s", e.g. "80s" or "1980s".
const RE_DECADE = /\b(\d{1,3})0s\b/g;
// Lower-cased unit abbreviation → spoken (plural) unit name.
const UNIT_MAP: Record<string, string> = {
  // length and mass
  km: "kilometers",
  kg: "kilograms",
  mg: "milligrams",
  ml: "milliliters",
  // data sizes
  gb: "gigabytes",
  mb: "megabytes",
  kb: "kilobytes",
  tb: "terabytes",
  // frequency
  hz: "hertz",
  khz: "kilohertz",
  mhz: "megahertz",
  ghz: "gigahertz",
  // speed
  mph: "miles per hour",
  kph: "kilometers per hour",
  // time
  ms: "milliseconds",
  ns: "nanoseconds",
  "µs": "microseconds",
  // temperature, with the degree sign on either side of the letter
  "°c": "degrees Celsius",
  "c°": "degrees Celsius",
  "°f": "degrees Fahrenheit",
  "f°": "degrees Fahrenheit",
};

// Magnitude suffix letter → scale word.
const SCALE_MAP: Record<string, string> = {
  K: "thousand",
  M: "million",
  B: "billion",
  T: "trillion",
};

// Tens digit of a decade → spoken decade name.
const DECADE_MAP: Record<number, string> = {
  0: "hundreds",
  1: "tens",
  2: "twenties",
  3: "thirties",
  4: "forties",
  5: "fifties",
  6: "sixties",
  7: "seventies",
  8: "eighties",
  9: "nineties",
};
// ── Expansion functions ──
/**
 * Replace numeric ordinals ("1st", "22nd") with their spoken form.
 *
 * @param text - Input text.
 * @returns Text with every RE_ORDINAL match spelled out.
 */
function expandOrdinals(text: string): string {
  const spell = (_match: string, digits: string): string =>
    ordinalSuffix(parseInt(digits, 10));
  return text.replace(RE_ORDINAL, spell);
}
/**
 * Replace percentages ("12.5%", "-3 %") with "<number> percent".
 *
 * The digits are spoken as written: the matched text is handed to the number
 * helpers directly rather than being round-tripped through parseFloat, which
 * would turn very small values into exponent notation. A minus sign that
 * directly follows a digit ("5-10%") is read as a range separator, not as a
 * negative sign.
 *
 * @param text - Input text.
 * @returns Text with every percentage spelled out.
 */
function expandPercentages(text: string): string {
  return text.replace(RE_PERCENT, (match: string, raw: string, offset: number, whole: string) => {
    const clean = raw.replace(/,/g, "");
    // The pattern also accepts a run of commas with no digits; leave it alone.
    if (!/\d/.test(clean)) return match;

    const isRangeEnd =
      clean.startsWith("-") && offset > 0 && /\d/.test(whole.charAt(offset - 1));
    const body = isRangeEnd ? clean.slice(1) : clean;
    const w = body.includes(".") ? floatToWords(body) : numberToWords(parseInt(body, 10));
    return isRangeEnd ? ` to ${w} percent` : `${w} percent`;
  });
}
/**
 * Replace currency amounts ("$5", "€1.50", "$3.2M") with their spoken form.
 *
 * - With a K/M/B/T suffix: "<number> <scale> <currency plural>".
 * - With decimals: "<whole> <currency> and <cents> cent(s)", using the first
 *   two decimal digits as cents.
 * - Otherwise: "<number> <currency>".
 *
 * The currency name is singular when the whole amount is exactly one, and
 * names with an invariant plural (yen, won) never take an "s".
 *
 * @param text - Input text.
 * @returns Text with every currency amount spelled out.
 */
function expandCurrency(text: string): string {
  // Currency names that do not take an "s" in the plural.
  const invariant = new Set(["yen", "won"]);
  const unitName = (unit: string, count: number): string =>
    unit && count !== 1 && !invariant.has(unit) ? `${unit}s` : unit;

  return text.replace(RE_CURRENCY, (match: string, symbol: string, raw: string, scaleSuffix?: string) => {
    const clean = raw.replace(/,/g, "");
    // The pattern also accepts a run of commas with no digits; leave it alone.
    if (!/\d/.test(clean)) return match;
    const unit = CURRENCY_SYMBOLS[symbol] || "";

    if (scaleSuffix) {
      const scaleWord = SCALE_MAP[scaleSuffix];
      const num = clean.includes(".") ? floatToWords(clean) : numberToWords(parseInt(clean, 10));
      // A scaled amount ("one million ...") is always plural.
      return `${num} ${scaleWord} ${unitName(unit, 2)}`.trim();
    }

    if (clean.includes(".")) {
      const [intPart, decPart] = clean.split(".");
      const whole = parseInt(intPart || "0", 10);
      const cents = parseInt(decPart.slice(0, 2).padEnd(2, "0"), 10);
      let result = `${numberToWords(whole)} ${unitName(unit, whole)}`.trim();
      if (cents) result += ` and ${numberToWords(cents)} cent${cents !== 1 ? "s" : ""}`;
      return result;
    }

    const val = parseInt(clean, 10);
    return `${numberToWords(val)} ${unitName(unit, val)}`;
  });
}
/**
 * Replace clock times ("3:05 pm", "14:00", "12:30:45") with their spoken form.
 *
 * - Zero minutes: the hour alone when an am/pm marker is present, otherwise
 *   "<hour> hundred".
 * - Minutes 1–9: "<hour> oh <minutes>".
 * - Otherwise: "<hour> <minutes>".
 * - Non-zero seconds are appended as "and <n> second(s)" instead of being
 *   dropped.
 * - The am/pm marker, lower-cased, comes last.
 *
 * @param text - Input text.
 * @returns Text with every time spelled out.
 */
function expandTime(text: string): string {
  return text.replace(RE_TIME, (_match: string, h: string, m: string, s?: string, suffix?: string) => {
    const hour = parseInt(h, 10);
    const mins = parseInt(m, 10);
    const secs = s ? parseInt(s, 10) : 0;
    const sfx = suffix ? ` ${suffix.toLowerCase()}` : "";
    const hWords = numberToWords(hour);

    let spoken: string;
    if (mins === 0) {
      spoken = suffix ? hWords : `${hWords} hundred`;
    } else if (mins < 10) {
      spoken = `${hWords} oh ${numberToWords(mins)}`;
    } else {
      spoken = `${hWords} ${numberToWords(mins)}`;
    }

    if (secs) spoken += ` and ${numberToWords(secs)} second${secs === 1 ? "" : "s"}`;
    return `${spoken}${sfx}`;
  });
}
/**
 * Replace numeric ranges ("10-20") with "<low> to <high>".
 *
 * @param text - Input text.
 * @returns Text with every RE_RANGE match spelled out.
 */
function expandRanges(text: string): string {
  const spell = (digits: string): string => numberToWords(parseInt(digits, 10));
  return text.replace(RE_RANGE, (_match: string, low: string, high: string) => {
    return `${spell(low)} to ${spell(high)}`;
  });
}
/**
 * Replace the hyphen between a name and its version ("GPT-4") with a space,
 * so the version is later read as a number rather than as a negative value.
 *
 * @param text - Input text.
 * @returns Text with every RE_MODEL_VER match rewritten as "<name> <version>".
 */
function expandModelNames(text: string): string {
  // "$1" is the name and "$2" the version captured by RE_MODEL_VER.
  return text.replace(RE_MODEL_VER, "$1 $2");
}
/**
 * Replace a number followed by a unit abbreviation ("5 km", "2.4GHz") with
 * the spelled-out number and unit name.
 *
 * The digits are spoken as written: the matched text is handed to the number
 * helpers directly rather than being round-tripped through parseFloat, which
 * would turn very small values into exponent notation. Abbreviations missing
 * from UNIT_MAP are kept as they appear.
 *
 * @param text - Input text.
 * @returns Text with every RE_UNIT match spelled out.
 */
function expandUnits(text: string): string {
  return text.replace(RE_UNIT, (_match: string, raw: string, unit: string) => {
    const expanded = UNIT_MAP[unit.toLowerCase()] || unit;
    const num = raw.includes(".") ? floatToWords(raw) : numberToWords(parseInt(raw, 10));
    return `${num} ${expanded}`;
  });
}
/**
 * Replace a number with a magnitude suffix ("7B", "1.5M") with the
 * spelled-out number followed by the scale word.
 *
 * @param text - Input text.
 * @returns Text with every RE_SCALE match spelled out.
 */
function expandScaleSuffixes(text: string): string {
  return text.replace(RE_SCALE, (_match: string, amount: string, letter: string) => {
    let spoken: string;
    if (amount.includes(".")) {
      spoken = floatToWords(amount);
    } else {
      spoken = numberToWords(parseInt(amount, 10));
    }
    // Fall back to the letter itself if it has no entry in the table.
    const magnitude = SCALE_MAP[letter] || letter;
    return `${spoken} ${magnitude}`;
  });
}
/**
 * Replace scientific notation ("1.5e-3") with
 * "<coefficient> times ten to the [negative ]<exponent>".
 *
 * @param text - Input text.
 * @returns Text with every RE_SCI match spelled out.
 */
function expandScientific(text: string): string {
  return text.replace(RE_SCI, (_match: string, mantissa: string, exponent: string) => {
    const power = parseInt(exponent, 10);
    const polarity = power < 0 ? "negative " : "";
    const powerWords = numberToWords(Math.abs(power));

    let mantissaWords: string;
    if (mantissa.includes(".")) {
      mantissaWords = floatToWords(mantissa);
    } else {
      mantissaWords = numberToWords(parseInt(mantissa, 10));
    }

    return `${mantissaWords} times ten to the ${polarity}${powerWords}`;
  });
}
/**
 * Replace simple fractions ("1/2", "3/4", "2/3") with their spoken form.
 * A fraction whose denominator is zero is left untouched.
 *
 * @param text - Input text.
 * @returns Text with every RE_FRACTION match spelled out.
 */
function expandFractions(text: string): string {
  // Word for the denominator, pluralized unless the numerator is exactly one.
  const denominatorWord = (numerator: number, denominator: number): string => {
    const single = numerator === 1;
    switch (denominator) {
      case 2:
        return single ? "half" : "halves";
      case 4:
        return single ? "quarter" : "quarters";
      default: {
        const ordinal = ordinalSuffix(denominator);
        return single ? ordinal : ordinal + "s";
      }
    }
  };

  return text.replace(RE_FRACTION, (whole: string, top: string, bottom: string) => {
    const numerator = parseInt(top, 10);
    const denominator = parseInt(bottom, 10);
    if (denominator === 0) return whole;
    return `${numberToWords(numerator)} ${denominatorWord(numerator, denominator)}`;
  });
}
/**
 * Replace decades ("80s", "1980s") with their spoken form
 * ("eighties", "nineteen eighties").
 *
 * @param text - Input text.
 * @returns Text with every RE_DECADE match spelled out.
 */
function expandDecades(text: string): string {
  return text.replace(RE_DECADE, (_match: string, digits: string) => {
    // `digits` is everything before the trailing "0s"; its last digit picks
    // the decade name and whatever precedes it is the century.
    const value = parseInt(digits, 10);
    const decadeName = DECADE_MAP[value % 10] || "";
    return value >= 10
      ? `${numberToWords(Math.floor(value / 10))} ${decadeName}`
      : decadeName;
  });
}
/**
 * Replace every remaining number with its spoken form.
 *
 * Commas inside a number are treated as thousands separators and removed.
 * Commas at either end of a match are punctuation and are kept, and a match
 * that contains no digit at all (the pattern accepts a bare run of commas) is
 * returned unchanged, so sentence commas survive this pass.
 *
 * @param text - Input text.
 * @returns Text with every RE_NUMBER match that contains a digit spelled out.
 */
function replaceNumbers(text: string): string {
  return text.replace(RE_NUMBER, (m: string) => {
    if (!/\d/.test(m)) return m;

    // Peel off punctuation commas surrounding the number proper.
    const withoutLead = m.replace(/^,+/, "");
    const core = withoutLead.replace(/,+$/, "");
    const lead = m.slice(0, m.length - withoutLead.length);
    const trail = withoutLead.slice(core.length);

    const clean = core.replace(/,/g, "");
    const words = clean.includes(".")
      ? floatToWords(clean)
      : numberToWords(parseInt(clean, 10));
    return `${lead}${words}${trail}`;
  });
}
/**
 * Insert the missing zero in decimals written without an integer part
 * (".5" → "0.5", "-.5" → "-0.5"). A dot that directly follows a digit is left
 * alone, so ordinary decimals such as "3.14" are unchanged.
 *
 * A replacer function is used instead of a "$1" + "0.$2" template: that
 * template evaluates to "$10.$2", which only works because engines fall back
 * to group 1 when group 10 does not exist.
 *
 * @param text - Input text.
 * @returns Text with leading-dot decimals normalized.
 */
function normalizeLeadingDecimals(text: string): string {
  return text.replace(
    /(?<!\d)(-?)\.(\d)/g,
    (_match: string, sign: string, digit: string) => `${sign}0.${digit}`,
  );
}
// http(s):// URLs and bare "www." hosts, up to the next whitespace.
const RE_URL = /https?:\/\/\S+|www\.\S+/g;
// Email addresses (any case).
const RE_EMAIL = /\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b/gi;
// Anything that looks like an HTML/XML tag.
const RE_HTML = /<[^>]+>/g;
// Every character that is not a letter, a number, an underscore, whitespace,
// or prosody-bearing punctuation (. , ? ! ; : - em dash, en dash, ellipsis).
// Letters and numbers are matched by Unicode property, so accented and
// non-Latin text is kept; ASCII-only `\w` would replace it with spaces.
const RE_PUNCT = /[^\p{L}\p{N}_\s.,?!;:\-\u2014\u2013\u2026]/gu;
// Runs of whitespace.
const RE_SPACES = /\s+/g;
/**
 * Normalize raw text for speech synthesis.
 *
 * Steps, in order:
 * 1. Remove URLs and email addresses; replace HTML tags with a space.
 * 2. Give leading-dot decimals an explicit zero.
 * 3. Expand special numeric forms, most specific first, then any remaining
 *    plain numbers.
 * 4. Replace punctuation outside the allowed set with spaces.
 * 5. Lower-case, collapse whitespace and trim.
 *
 * @param text - Raw input text.
 * @returns The normalized, lower-cased text.
 */
export function preprocessText(text: string): string {
  const stripped = text
    .replace(RE_URL, "")
    .replace(RE_EMAIL, "")
    .replace(RE_HTML, " ");

  // Order matters: each expander consumes the digits it recognizes so that
  // the generic number pass at the end only sees what is left.
  const stages: Array<(input: string) => string> = [
    normalizeLeadingDecimals,
    expandCurrency,
    expandPercentages,
    expandScientific,
    expandTime,
    expandOrdinals,
    expandUnits,
    expandScaleSuffixes,
    expandFractions,
    expandDecades,
    expandRanges,
    expandModelNames,
    replaceNumbers,
  ];
  const expanded = stages.reduce((current, stage) => stage(current), stripped);

  return expanded
    .replace(RE_PUNCT, " ")
    .toLowerCase()
    .replace(RE_SPACES, " ")
    .trim();
}