Spaces:

shreyask
/

KittenTTS-WebGPU

Running

File size: 10,848 Bytes

9b1aef8

/**
 * Text preprocessor — converts numbers, currencies, ordinals, etc. to words.
 * Port of KittenTTS preprocess.py.
 * https://github.com/KittenML/KittenTTS
 */

// ── Number → Words ──

/** Words for 0–19, indexed by value. Index 0 is the empty string so callers can test for "no word". */
const ONES = [
  "",
  "one", "two", "three", "four", "five",
  "six", "seven", "eight", "nine", "ten",
  "eleven", "twelve", "thirteen", "fourteen", "fifteen",
  "sixteen", "seventeen", "eighteen", "nineteen",
];

/** Words for multiples of ten, indexed by the tens digit. Slots 0 and 1 are empty placeholders. */
const TENS = [
  "", "",
  "twenty", "thirty", "forty", "fifty",
  "sixty", "seventy", "eighty", "ninety",
];

/** Name attached to each successive group of three digits, lowest group first. */
const SCALE = ["", "thousand", "million", "billion", "trillion"];

/** Ordinal forms that are looked up directly, before any generic suffix rule is tried. */
const ORDINAL_EXCEPTIONS: Record<string, string> = {
  one: "first",
  two: "second",
  three: "third",
  four: "fourth",
  five: "fifth",
  six: "sixth",
  seven: "seventh",
  eight: "eighth",
  nine: "ninth",
  twelve: "twelfth",
};

/** Singular spoken name of each recognised currency symbol. */
const CURRENCY_SYMBOLS: Record<string, string> = {
  "$": "dollar",
  "€": "euro",
  "£": "pound",
  "¥": "yen",
  "₹": "rupee",
  "₩": "won",
  "₿": "bitcoin",
};

/**
 * Spells out one group of up to three digits (1–999), e.g. 342 → "three hundred forty-two".
 * Returns the empty string for 0 so callers can skip empty groups.
 */
function threeDigitsToWords(n: number): string {
  if (n === 0) return "";

  const words: string[] = [];
  const hundredsDigit = Math.floor(n / 100);
  const belowHundred = n % 100;

  if (hundredsDigit) words.push(ONES[hundredsDigit] + " hundred");

  if (belowHundred >= 20) {
    // Tens and units are hyphenated ("forty-two"); a round ten stands alone ("forty").
    const tens = TENS[Math.floor(belowHundred / 10)];
    const units = ONES[belowHundred % 10];
    words.push(units ? tens + "-" + units : tens);
  } else if (belowHundred) {
    // 1–19 each have their own word.
    words.push(ONES[belowHundred]);
  }

  return words.join(" ");
}

/**
 * Spells out an integer in English words.
 *
 * - Non-integers are truncated toward zero first, so -1.5 reads "negative one".
 * - 0 yields "zero"; negative values are prefixed with "negative".
 * - Exact hundreds from 100 to 1900 that are not exact thousands use the
 *   "N hundred" reading (1200 → "twelve hundred").
 * - Only the scales listed in SCALE are rendered; digit groups above the last
 *   scale are not spoken.
 *
 * @param n Value to spell out.
 * @returns The spoken form, with three-digit groups joined by single spaces.
 */
export function numberToWords(n: number): string {
  // Math.trunc rather than Math.floor: floor would turn -1.5 into -2.
  const value = Number.isInteger(n) ? n : Math.trunc(n);
  if (value === 0) return "zero";
  if (value < 0) return `negative ${numberToWords(-value)}`;

  // Colloquial "twelve hundred" reading for exact hundreds below two thousand.
  if (value >= 100 && value <= 9999 && value % 100 === 0 && value % 1000 !== 0) {
    const hundreds = Math.floor(value / 100);
    if (hundreds < 20) return `${ONES[hundreds]} hundred`;
  }

  // Peel off three digits at a time, lowest group first, then reverse for speech order.
  const parts: string[] = [];
  let remaining = value;
  for (let i = 0; i < SCALE.length; i++) {
    const chunk = remaining % 1000;
    if (chunk) {
      const w = threeDigitsToWords(chunk);
      parts.push(SCALE[i] ? `${w} ${SCALE[i]}` : w);
    }
    remaining = Math.floor(remaining / 1000);
    if (remaining === 0) break;
  }
  return parts.reverse().join(" ");
}

/**
 * Spells out a decimal number: the integer part as a whole number and the
 * fractional part digit by digit, e.g. "3.14" → "three point one four".
 *
 * @param value Number or numeric string; a leading "-" is read as "negative".
 * @param sep Word spoken in place of the decimal point.
 * @returns The spoken form. Input without a "." is read as a plain integer.
 */
export function floatToWords(value: string | number, sep = "point"): string {
  const text = typeof value === "string" ? value : `${value}`;
  const isNegative = text.startsWith("-");
  const unsigned = isNegative ? text.slice(1) : text;

  let spoken: string;
  if (!unsigned.includes(".")) {
    spoken = numberToWords(parseInt(unsigned, 10));
  } else {
    const [whole, fraction] = unsigned.split(".");
    // Same words as ONES, except that a 0 digit is spoken as "zero".
    const digitNames = ["zero", ...ONES.slice(1)];
    // An empty integer part (".5") is read as "zero".
    const wholeWords = whole ? numberToWords(parseInt(whole, 10)) : "zero";
    const fractionWords = Array.from(fraction, (ch) => digitNames[parseInt(ch, 10)]).join(" ");
    spoken = `${wholeWords} ${sep} ${fractionWords}`;
  }

  return isNegative ? `negative ${spoken}` : spoken;
}

/**
 * Returns the ordinal word for n, e.g. 1 → "first", 20 → "twentieth",
 * 21 → "twenty-first", 21005 → "twenty-one thousand fifth".
 *
 * Only the final word of the cardinal form is changed. It is looked up in
 * ORDINAL_EXCEPTIONS first and otherwise converted by suffix rules.
 */
function ordinalSuffix(n: number): string {
  const word = numberToWords(n);

  // The final word starts after the last hyphen or space, whichever comes later.
  // Splitting on the hyphen alone would mis-handle "twenty-one thousand five".
  const cut = Math.max(word.lastIndexOf("-"), word.lastIndexOf(" "));
  const prefix = word.slice(0, cut + 1); // keeps the joiner; empty when there is none
  const last = word.slice(cut + 1);

  let lastOrd: string;
  const irregular = ORDINAL_EXCEPTIONS[last];
  if (irregular) {
    lastOrd = irregular;
  } else if (last.endsWith("y")) {
    // twenty → twentieth, thirty → thirtieth, ...
    lastOrd = `${last.slice(0, -1)}ieth`;
  } else if (last.endsWith("t")) {
    lastOrd = `${last}h`;
  } else if (last.endsWith("e")) {
    lastOrd = `${last.slice(0, -1)}th`;
  } else {
    lastOrd = `${last}th`;
  }
  return prefix + lastOrd;
}

// ── Regex patterns ──

// Number shape shared by RE_NUMBER, RE_PERCENT and RE_CURRENCY: `\d+(?:,\d+)*`
// accepts comma-grouped digits but never a leading, trailing or lone comma, so
// sentence punctuation next to a number ("In 2020, ...") is left in the text.

/**
 * Bare number: optional sign, comma-grouped digits, optional decimals.
 * A match may not start right after a letter or a digit, so letter-led tokens
 * such as "covid19" stay whole instead of being split mid-number.
 */
const RE_NUMBER = /(?<![a-zA-Z\d])-?\d+(?:,\d+)*(?:\.\d+)?/g;
/** Digits followed by an English ordinal suffix, e.g. "21st". Group 1 = digits. */
const RE_ORDINAL = /\b(\d+)(st|nd|rd|th)\b/gi;
/** Number followed by "%", optionally separated by whitespace. Group 1 = number. */
const RE_PERCENT = /(-?\d+(?:,\d+)*(?:\.\d+)?)\s*%/g;
/** Currency symbol, amount, optional K/M/B/T scale letter. Groups: symbol, amount, scale. */
const RE_CURRENCY = /([$€£¥₹₩₿])\s*(\d+(?:,\d+)*(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])/g;
/**
 * Clock time "h:mm" or "h:mm:ss" with optional am/pm. Groups: hour, minutes, seconds, am/pm.
 * Whitespace is consumed only when am/pm actually follows, so "3:30 we left"
 * keeps the space before "we".
 */
const RE_TIME = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?(?:\s*(am|pm))?\b/gi;
/** Two integers joined by a hyphen, not embedded in a longer word, e.g. "10-20". */
const RE_RANGE = /(?<!\w)(\d+)-(\d+)(?!\w)/g;
/** Letter-led name, hyphen, then a version made of digits and dots, e.g. "GPT-4". */
const RE_MODEL_VER = /\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)/g;
/**
 * Number followed by a unit abbreviation. Groups: number, unit.
 * NOTE(review): the trailing \b after `[cCfF]°` only holds when a word character
 * follows the degree sign, so "20C°" before a space or end of text is not matched.
 */
const RE_UNIT = /(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b/gi;
/** Number followed by an upper-case K/M/B/T scale letter, e.g. "1.5M". Same start guard as RE_NUMBER. */
const RE_SCALE = /(?<![a-zA-Z\d])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])/g;
/** Scientific notation, e.g. "1.5e-7". Groups: coefficient, signed exponent. */
const RE_SCI = /(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])/g;
/** Simple fraction "n/d" with optional spaces around the slash. */
const RE_FRACTION = /\b(\d+)\s*\/\s*(\d+)\b/g;
/** Decade or century form such as "80s" or "1980s". Group 1 = digits before the final "0s". */
const RE_DECADE = /\b(\d{1,3})0s\b/g;

/** Spoken (plural) form of each unit abbreviation, keyed by the lower-cased abbreviation. */
const UNIT_MAP: Record<string, string> = {
  // length and mass
  km: "kilometers",
  kg: "kilograms",
  mg: "milligrams",
  ml: "milliliters",
  // data sizes
  gb: "gigabytes",
  mb: "megabytes",
  kb: "kilobytes",
  tb: "terabytes",
  // frequency
  hz: "hertz",
  khz: "kilohertz",
  mhz: "megahertz",
  ghz: "gigahertz",
  // speed
  mph: "miles per hour",
  kph: "kilometers per hour",
  // durations
  ms: "milliseconds",
  ns: "nanoseconds",
  "µs": "microseconds",
  // temperature, with the degree sign on either side of the letter
  "°c": "degrees Celsius",
  "c°": "degrees Celsius",
  "°f": "degrees Fahrenheit",
  "f°": "degrees Fahrenheit",
};

/** Spoken word for each upper-case scale letter. */
const SCALE_MAP: Record<string, string> = {
  K: "thousand",
  M: "million",
  B: "billion",
  T: "trillion",
};

/** Plural decade name keyed by the decade digit; 0 stands for a whole century ("hundreds"). */
const DECADE_MAP: Record<number, string> = {
  0: "hundreds",
  1: "tens",
  2: "twenties",
  3: "thirties",
  4: "forties",
  5: "fifties",
  6: "sixties",
  7: "seventies",
  8: "eighties",
  9: "nineties",
};

// ── Expansion functions ──

/** Replaces numeric ordinals such as "3rd" with their spoken form ("third"). */
function expandOrdinals(text: string): string {
  // The written suffix (st/nd/rd/th) is ignored; the word is derived from the digits alone.
  const toOrdinalWord = (_match: string, digits: string): string =>
    ordinalSuffix(parseInt(digits, 10));
  return text.replace(RE_ORDINAL, toOrdinalWord);
}

/**
 * Replaces percentages such as "12.5%" with words ("twelve point five percent").
 * Thousands separators are dropped before conversion.
 */
function expandPercentages(text: string): string {
  return text.replace(RE_PERCENT, (_, raw) => {
    const clean = raw.replace(/,/g, "");
    // Pass the digit string itself to floatToWords. Going through parseFloat
    // loses written digits ("5.0" becomes 5) and yields exponent notation for
    // very small values ("0.0000001" prints as "1e-7"), which would then be
    // read as the integer one.
    const w = clean.includes(".") ? floatToWords(clean) : numberToWords(parseInt(clean, 10));
    return `${w} percent`;
  });
}

/**
 * Replaces currency amounts with words.
 *
 * - "$5" → "five dollars", "$1" → "one dollar"
 * - "$1.50" → "one dollar and fifty cents" (fraction read as cents, first two digits only)
 * - "$2.5M" → "two point five million dollars" (scale suffix K/M/B/T)
 */
function expandCurrency(text: string): string {
  return text.replace(RE_CURRENCY, (_, symbol, raw, scaleSuffix) => {
    const clean = raw.replace(/,/g, "");
    const unit = CURRENCY_SYMBOLS[symbol] || "";
    // Unit name for a whole amount: singular only for exactly one.
    const unitFor = (amount: number): string => `${unit}${amount !== 1 && unit ? "s" : ""}`;

    if (scaleSuffix) {
      // Scaled amounts are always plural: "one million dollars".
      const scaleWord = SCALE_MAP[scaleSuffix];
      const num = clean.includes(".") ? floatToWords(clean) : numberToWords(parseInt(clean, 10));
      return `${num} ${scaleWord} ${unit}s`.trim();
    }

    if (clean.includes(".")) {
      const [intPart, decPart] = clean.split(".");
      const whole = parseInt(intPart, 10);
      // Treat the first two decimals as cents, padding "5" to "50".
      const decVal = parseInt(decPart.slice(0, 2).padEnd(2, "0"), 10);
      // Same singular/plural rule as the whole-number branch, so "$1.50" is "one dollar ...".
      let result = `${numberToWords(whole)} ${unitFor(whole)}`;
      if (decVal) result += ` and ${numberToWords(decVal)} cent${decVal !== 1 ? "s" : ""}`;
      return result;
    }

    const val = parseInt(clean, 10);
    return `${numberToWords(val)} ${unitFor(val)}`;
  });
}

/**
 * Replaces clock times with words: "3:05 pm" → "three oh five pm",
 * "14:30" → "fourteen thirty", "9:00" → "nine hundred", "9:00 am" → "nine am".
 * A seconds field, if present, is matched but not spoken.
 */
function expandTime(text: string): string {
  return text.replace(RE_TIME, (_full, hourDigits, minuteDigits, _seconds, meridiem) => {
    const hourWords = numberToWords(parseInt(hourDigits, 10));
    const minutes = parseInt(minuteDigits, 10);
    const tail = meridiem ? ` ${meridiem.toLowerCase()}` : "";

    if (minutes === 0) {
      // On the hour: bare hour with am/pm, otherwise the "hundred" reading.
      return meridiem ? `${hourWords}${tail}` : `${hourWords} hundred${tail}`;
    }

    const minuteWords = numberToWords(minutes);
    // Single-digit minutes get a spoken leading "oh".
    return minutes < 10
      ? `${hourWords} oh ${minuteWords}${tail}`
      : `${hourWords} ${minuteWords}${tail}`;
  });
}

/** Replaces numeric ranges such as "10-20" with "ten to twenty". */
function expandRanges(text: string): string {
  const spell = (digits: string): string => numberToWords(parseInt(digits, 10));
  return text.replace(
    RE_RANGE,
    (_match: string, from: string, to: string) => `${spell(from)} to ${spell(to)}`,
  );
}

/**
 * Turns the hyphen in "name-version" tokens into a space ("GPT-4" → "GPT 4"),
 * leaving the version digits in place.
 */
function expandModelNames(text: string): string {
  // $1 = name, $2 = version; both groups always take part in a match.
  return text.replace(RE_MODEL_VER, "$1 $2");
}

/**
 * Replaces a number plus unit abbreviation with words, e.g. "5 km" → "five kilometers".
 * Units are looked up case-insensitively in UNIT_MAP; an unknown unit is kept as written.
 */
function expandUnits(text: string): string {
  return text.replace(RE_UNIT, (_, raw, unit) => {
    const expanded = UNIT_MAP[unit.toLowerCase()] || unit;
    // Pass the digit string itself to floatToWords. Going through parseFloat
    // loses written digits ("5.0" becomes 5) and yields exponent notation for
    // very small values, which would then be read as a different number.
    const num = raw.includes(".") ? floatToWords(raw) : numberToWords(parseInt(raw, 10));
    return `${num} ${expanded}`;
  });
}

/** Replaces scale-suffixed numbers such as "7B" or "1.5M" with words ("seven billion"). */
function expandScaleSuffixes(text: string): string {
  return text.replace(RE_SCALE, (_match, digits, letter) => {
    const isDecimal = digits.includes(".");
    const amount = isDecimal ? floatToWords(digits) : numberToWords(parseInt(digits, 10));
    // Fall back to the raw letter if it has no entry in SCALE_MAP.
    const scaleName = SCALE_MAP[letter] || letter;
    return `${amount} ${scaleName}`;
  });
}

/**
 * Replaces scientific notation with words, e.g.
 * "1.5e-7" → "one point five times ten to the negative seven".
 */
function expandScientific(text: string): string {
  return text.replace(RE_SCI, (_match, mantissa, exponentDigits) => {
    const exponent = parseInt(exponentDigits, 10);
    const sign = exponent < 0 ? "negative " : "";
    const exponentWords = numberToWords(Math.abs(exponent));
    const mantissaWords = mantissa.includes(".")
      ? floatToWords(mantissa)
      : numberToWords(parseInt(mantissa, 10));
    return `${mantissaWords} times ten to the ${sign}${exponentWords}`;
  });
}

/**
 * Replaces simple fractions with words: "1/2" → "one half", "3/4" → "three quarters",
 * "2/3" → "two thirds". A zero denominator is left untouched.
 */
function expandFractions(text: string): string {
  return text.replace(RE_FRACTION, (whole, numDigits, denDigits) => {
    const numerator = parseInt(numDigits, 10);
    const denominator = parseInt(denDigits, 10);
    if (denominator === 0) return whole;

    const singular = numerator === 1;
    let part: string;
    switch (denominator) {
      case 2:
        part = singular ? "half" : "halves";
        break;
      case 4:
        part = singular ? "quarter" : "quarters";
        break;
      default:
        // Other denominators use the ordinal word, pluralised unless the numerator is one.
        part = ordinalSuffix(denominator) + (singular ? "" : "s");
    }
    return `${numberToWords(numerator)} ${part}`;
  });
}

/**
 * Replaces decade forms with words: "80s" → "eighties", "1980s" → "nineteen eighties",
 * "1900s" → "nineteen hundreds".
 */
function expandDecades(text: string): string {
  return text.replace(RE_DECADE, (_match, stem) => {
    // `stem` is everything before the trailing "0s"; its last digit selects the decade name.
    const value = parseInt(stem, 10);
    const name = DECADE_MAP[value % 10] || "";
    return value < 10 ? name : `${numberToWords(Math.floor(value / 10))} ${name}`;
  });
}

/**
 * Spells out every remaining number matched by RE_NUMBER, as a decimal when it
 * contains a "." and as an integer otherwise. Thousands separators are dropped first.
 */
function replaceNumbers(text: string): string {
  const spell = (token: string): string => {
    const digits = token.replace(/,/g, "");
    return digits.includes(".") ? floatToWords(digits) : numberToWords(parseInt(digits, 10));
  };
  return text.replace(RE_NUMBER, spell);
}

/**
 * Inserts the missing zero in bare leading decimals so later number patterns
 * can match them: ".5" → "0.5" and "-.5" → "-0.5". A dot that already follows
 * a digit ("3.14") is left alone.
 */
function normalizeLeadingDecimals(text: string): string {
  // Signed form first; afterwards its dot follows a digit, so the second pass skips it.
  const signedFixed = text.replace(
    /(?<!\d)(-)\.([\d])/g,
    (_match: string, sign: string, digit: string) => `${sign}0.${digit}`,
  );
  return signedFixed.replace(
    /(?<!\d)\.([\d])/g,
    (_match: string, digit: string) => `0.${digit}`,
  );
}

/** http(s):// or www. links, up to the next whitespace. */
const RE_URL = /https?:\/\/\S+|www\.\S+/g;
/** Email-like tokens of the form local@domain.tld. */
const RE_EMAIL = /\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b/gi;
/** Anything between "<" and the next ">". */
const RE_HTML = /<[^>]+>/g;
/**
 * Every character that is not a letter, digit, underscore, whitespace, or one
 * of the kept punctuation marks (. , ? ! ; : - em dash, en dash, ellipsis).
 * Letters and digits are matched with Unicode properties because `\w` is
 * ASCII-only in JavaScript and would strip accented letters ("café" → "caf ").
 */
const RE_PUNCT = /[^\p{L}\p{N}_\s.,?!;:\-\u2014\u2013\u2026]/gu;
/** Any run of whitespace. */
const RE_SPACES = /\s+/g;

/**
 * Normalises raw text for speech synthesis.
 *
 * Steps, in order: strip URLs, emails and HTML tags; expand numeric forms into
 * words (specific forms before the generic number pass); replace non-prosodic
 * punctuation with spaces; lower-case; collapse whitespace and trim.
 *
 * @param text Raw input text.
 * @returns Lower-cased text with numbers spelled out.
 */
export function preprocessText(text: string): string {
  // URLs and emails are dropped outright; tags become a space so neighbouring words stay apart.
  const stripped = text
    .replace(RE_URL, "")
    .replace(RE_EMAIL, "")
    .replace(RE_HTML, " ");

  // Order matters: each specialised expansion must see its digits before the
  // generic number pass at the end turns whatever is left into words.
  const stages: Array<(input: string) => string> = [
    normalizeLeadingDecimals,
    expandCurrency,
    expandPercentages,
    expandScientific,
    expandTime,
    expandOrdinals,
    expandUnits,
    expandScaleSuffixes,
    expandFractions,
    expandDecades,
    expandRanges,
    expandModelNames,
    replaceNumbers,
  ];
  const expanded = stages.reduce((current, stage) => stage(current), stripped);

  return expanded
    .replace(RE_PUNCT, " ")
    .toLowerCase()
    .replace(RE_SPACES, " ")
    .trim();
}