// ===========================================================================
// PALETTE — pulled from CSS variables so tokens stay single-source-of-truth
// ===========================================================================
// Computed style of the root element; the palette custom properties are read from it.
const CSS = getComputedStyle(document.documentElement);
// Values of --palette-1 through --palette-12, whitespace-trimmed, in numeric order.
const PALETTE = Array.from({ length: 12 }, (_, index) =>
  CSS.getPropertyValue(`--palette-${index + 1}`).trim()
);
/**
 * Return a lighter, more saturated hex variant of a color.
 *
 * The color is converted with d3.hsl, its lightness is raised by
 * `lightnessBoost` (capped at 0.9) and its saturation by `saturationBoost`
 * (capped at 1), and the result is formatted back to a hex string.
 *
 * @param {string} hex - Color accepted by d3.hsl.
 * @param {number} [lightnessBoost=0.4] - Amount added to the HSL lightness.
 * @param {number} [saturationBoost=0.12] - Amount added to the HSL saturation.
 * @returns {string} Result of formatHex() on the adjusted color.
 */
function luminousVariant(hex, lightnessBoost = 0.4, saturationBoost = 0.12) {
  const color = d3.hsl(hex);
  const { l: lightness, s: saturation } = color;
  Object.assign(color, {
    l: Math.min(0.9, lightness + lightnessBoost),
    s: Math.min(1, saturation + saturationBoost),
  });
  return color.formatHex();
}
/**
 * Format a row count compactly with exactly one decimal place:
 * `1.5M` for millions, `68.7k` for thousands, and the plain number below 1,000.
 *
 * The unit is chosen from the *rounded* value, so counts that round up to
 * 1000.0 thousand (e.g. 999,999) are shown as `1.0M` rather than `1000.0k`.
 *
 * @param {number} n - Row count.
 * @returns {string} Formatted count.
 */
function formatRows(n) {
  // Written as a negation so NaN also takes the plain-string path.
  if (!(n >= 1_000)) return String(n);

  const thousands = (n / 1_000).toFixed(1);
  const useMillions = n >= 1_000_000 || Number(thousands) >= 1_000;
  const scaled = useMillions ? (n / 1_000_000).toFixed(1) : thousands;

  // Whole and fractional digits are kept as separate pieces of the output.
  const [whole, frac] = scaled.split('.');
  return `${whole}.${frac}${useMillions ? 'M' : 'k'}`;
}
/**
 * Format a row count compactly with at most one decimal place, dropping a
 * trailing `.0`: `13.4M`, `30k`, `1.5k`, or the plain number below 1,000.
 *
 * The unit is chosen from the *rounded* value, so counts that round up to
 * 1000 thousand (e.g. 999,999) are shown as `1M` rather than `1000k`.
 *
 * @param {number} n - Row count.
 * @returns {string} Formatted count.
 */
function formatShort(n) {
  // Written as a negation so NaN also takes the plain-string path.
  if (!(n >= 1_000)) return String(n);

  const dropZeroDecimal = (text) => text.replace(/\.0$/, '');
  const thousands = (n / 1_000).toFixed(1);
  if (n >= 1_000_000 || Number(thousands) >= 1_000) {
    return dropZeroDecimal((n / 1_000_000).toFixed(1)) + 'M';
  }
  return dropZeroDecimal(thousands) + 'k';
}
// Upgrade hero stat numbers to support a decimal span: each `.stat .num`
// element's data-value is split into whole part, optional decimal part and
// optional suffix. Values that do not match are written back as plain text.
for (const statEl of document.querySelectorAll('.stat .num')) {
  const rawValue = statEl.getAttribute('data-value');
  const parts = rawValue && rawValue.match(/^([~≈]?[\d,]+)(\.[\d]+)?([A-Za-z+]+)?$/);
  if (!parts) {
    statEl.textContent = rawValue || '';
    continue;
  }
  const whole = parts[1];
  const frac = parts[2] === undefined ? '' : parts[2];
  const suffix = parts[3] === undefined ? '' : parts[3];
  if (frac || suffix) {
    statEl.innerHTML = `${whole}${frac}${suffix}`;
  } else {
    statEl.innerHTML = whole;
  }
}
// ===========================================================================
// DATASET CATALOG
// ===========================================================================
/**
 * Catalog of the datasets shown on the page — one record per dataset.
 *
 * Fields as they appear in the records below:
 *   key        short identifier; showDetails() looks a record up by it and
 *              datasetColor uses the keys as its ordinal domain.
 *   title, tagline
 *              display strings.
 *   raw        { repo, rows } for the raw repo, or null when there is none.
 *   adaption   { repo, rows } for the Adaption repo, or null when there is none.
 *   languages, modality, license
 *              free-text descriptive strings.
 *   schema     array of column names.
 *   model      optional; present on only some records.
 *   group      "paired" on every record in this list.
 *
 * A `color` property is added to each record right after this literal.
 */
const DATASETS = [
{
key: "speech",
title: "Multilingual Synthetic Speech",
tagline: "Zero-shot voice cloning with Qwen3-TTS across 9 languages",
raw: { repo: "Reubencf/multilingual-synthetic-tts", rows: 68677 },
adaption: { repo: "Reubencf/Adaption-multilingual-speech", rows: 10274 },
languages: "en, ja, zh, ko, de, es, fr, ru, pt",
modality: "audio + text",
license: "open / synthetic",
schema: ["audio", "text", "language", "language_name", "style", "voice", "sample_rate"],
model: "Qwen3-TTS-12Hz-1.7B-Base",
group: "paired"
},
{
key: "sentences",
title: "Multilingual Sentences (text-only)",
tagline: "Text projection of the TTS corpus — ready for Adaption",
raw: null,
adaption: { repo: "Reubencf/Adaption-multilingual-sentences", rows: 10000 },
languages: "ja, ru, ko, de, es, pt, zh, en, fr + 114 more",
modality: "text",
license: "open",
schema: ["text", "enhanced_prompt", "enhanced_completion", "language", "voice", "style"],
group: "paired"
},
{
key: "music",
title: "Music — FMA Labeled",
tagline: "Creative-Commons music tracks with lyrics, genre, mood, BPM, key",
raw: { repo: "Reubencf/fma-labeled", rows: 29000 },
adaption: { repo: "Reubencf/adaption-music-style-prompts", rows: 9950 },
languages: "en",
modality: "audio + text",
license: "CC-BY / CC0 (source-dependent)",
schema: ["audio", "lyrics", "genre", "sub_genres", "mood", "instruments", "bpm", "key", "vocal_type", "energy", "era", "quality"],
model: "gemini-flash-latest",
group: "paired"
},
{
key: "street",
title: "StreetView Global",
tagline: "Globally-sampled Mapillary street images with scene classification",
raw: { repo: "Reubencf/streetview-global", rows: 30000 },
adaption: { repo: "Reubencf/adaption-street-scene-descriptions", rows: 10100 },
languages: "en",
modality: "image + text",
license: "CC-BY-SA-4.0",
schema: ["image", "scene_description", "setting", "weather", "time_of_day", "road_type", "infrastructure", "lat", "lon", "compass"],
group: "paired"
},
{
key: "magazines",
title: "Magazines Multilingual VQA",
tagline: "Public-domain magazine OCR in 40+ source languages (including low-resource)",
raw: { repo: "Reubencf/magazines-multilingual-vqa", rows: 29039 },
adaption: { repo: "Reubencf/adaption-multilingual-doc-qa", rows: 8800 },
languages: "ar, de, en, es, fr, hi, it, ja, pt, zh + 35 more (Afrikaans, Amharic, Yoruba, Yiddish, Bengali, Santali, Somali, Vietnamese, Russian, Maithili, Tibetan, …)",
modality: "image + text",
license: "CC-BY-4.0",
schema: ["image", "ocr_text", "english_description", "question", "answer", "target_language", "page_type"],
model: "Gemma 4 31B via vLLM",
group: "paired"
},
{
key: "lowresource",
title: "Low-Resource Doc Q/A",
tagline: "Low-resource-language slice of the magazines corpus",
raw: null,
adaption: { repo: "Reubencf/Adaption-low-resource-doc-qa", rows: 10200 },
languages: "Afrikaans, Amharic, Yoruba, Yiddish, Bengali, Santali, Somali, Vietnamese, Maithili, Tigrinya, Meitei, Lao, …",
modality: "image + text",
license: "CC-BY-4.0",
schema: ["image", "ocr_text", "question", "answer", "source_language"],
group: "paired"
},
{
key: "captions",
title: "Multilingual Image Captions",
tagline: "English + multilingual captions with bounding-box visualizations",
raw: { repo: "Reubencf/multilingual-image-annotations", rows: 464 },
adaption: { repo: "Reubencf/adaption-multilingual-image-captions", rows: 462 },
languages: "en, es, fr, hi, zh, ar, pt",
modality: "image + text",
license: "CC-BY-4.0",
schema: ["image", "boxed_image", "description_en", "descriptions", "vqa", "detections"],
model: "Gemma 4 31B",
group: "paired"
},
{
key: "frontend",
title: "Frontend Coding",
tagline: "Hand-curated HTML / Tailwind / JS prompts and completions",
raw: { repo: "Reubencf/frontend-coding", rows: 500 },
adaption: { repo: "Reubencf/frontend-html-tailwind-js", rows: 145 },
languages: "en",
modality: "text (code)",
license: "MIT",
schema: ["prompt", "previous_code", "code", "reasoning"],
group: "paired"
},
{
key: "news2026",
title: "Current Affairs 2026",
tagline: "2026 Wikipedia current-events Q/A with RAG grounding (through Apr 9, 2026)",
raw: { repo: "Reubencf/future-news-events-2026", rows: 5447 },
adaption: { repo: "Reubencf/current-affairs-2026", rows: 5339 },
languages: "en",
modality: "text",
license: "open",
schema: ["question", "answer", "enhanced_prompt", "enhanced_completion", "reasoning_trace", "date", "event_id", "section", "source"],
model: "Cohere Command R + RAG",
group: "paired"
},
{
key: "news2025",
title: "Current Affairs 2025",
tagline: "2025 global events Q/A",
raw: { repo: "Reubencf/2025_events", rows: 5390 },
adaption: { repo: "Reubencf/current-affairs-2025", rows: 5390 },
languages: "en",
modality: "text",
license: "open",
schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
group: "paired"
},
{
key: "news2024",
title: "Current Affairs 2024",
tagline: "2024 global events Q/A",
raw: { repo: "Reubencf/2024_events", rows: 5190 },
adaption: { repo: "Reubencf/current-affairs-2024", rows: 5190 },
languages: "en",
modality: "text",
license: "open",
schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
group: "paired"
},
{
key: "news2023",
title: "Current Affairs 2023",
tagline: "2023 global events Q/A",
raw: { repo: "Reubencf/2023_events", rows: 4667 },
adaption: { repo: "Reubencf/current-affairs-2023", rows: 4667 },
languages: "en",
modality: "text",
license: "open",
schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
group: "paired"
},
// Pre-training pools — raw-only records (adaption is null), so they show up
// in the raw donut but not in the Adaption donut.
{
key: "polyaudio",
title: "PolyglotAudio",
tagline: "Broad multilingual audio pre-training pool",
raw: { repo: "Reubencf/PolyglotAudio", rows: 1200000 },
adaption: null,
languages: "multilingual",
modality: "audio + text",
license: "open",
schema: ["audio", "text", "language"],
group: "paired"
},
{
key: "polytext",
title: "PolyglotText",
tagline: "Large multilingual text pre-training pool",
raw: { repo: "Reubencf/PolyglotText", rows: 13400000 },
adaption: null,
languages: "multilingual",
modality: "text",
license: "open",
schema: ["text", "language"],
group: "paired"
},
];
// Stable color per dataset key — the ordinal scale maps the dataset keys (in
// catalog order) onto PALETTE; the resolved color is also stored on each
// record as `color`.
const datasetColor = d3.scaleOrdinal()
  .domain(DATASETS.map(({ key }) => key))
  .range(PALETTE);
for (const dataset of DATASETS) {
  dataset.color = datasetColor(dataset.key);
}
// ===========================================================================
// DONUT CHART (D3 — used for both Raw/Adaption donuts and the modality donut)
// ===========================================================================
// Tooltip element shared by every donut; renderDonut()'s hover handlers set
// its content, position and opacity.
const tooltipEl = document.getElementById('donut-tooltip');
// Per-SVG selection state for drill-down (scale-up selected, dim others).
// renderDonut() stores { paths, arcGen, resolveColor } under the svg id;
// focusDonutSlice() / resetDonutFocus() set and clear `selectedKey` on it.
const donutState = new Map(); // svgId -> { paths, arcGen, resolveColor, selectedKey? }
/**
 * Draw an animated donut chart into an existing SVG element and wire up its
 * hover tooltip, click drill-down and (optionally) its center read-out.
 *
 * Takes a single options object:
 * @param {string} svgId - id of the target SVG; its current children are removed first.
 * @param {string} [centerId] - id of the element that receives the center
 *   read-out (slice count and summed value). Skipped when falsy.
 * @param {*} field - accepted but not referenced in the function body.
 * @param {Array} datasets - items to chart; only those with getValue(item) > 0 get a slice.
 * @param {Function} getValue - item -> numeric value (shown in tooltip, summed for the center).
 * @param {Function} getKey - item -> key passed to focusDonutSlice() and showDetails().
 * @param {Function} getTitle - item -> tooltip title.
 * @param {Function} [getColor] - item -> fill color; defaults to datasetColor(getKey(item)).
 * @param {Function} [getMeta] - item -> extra tooltip text.
 * @param {*} colorScale - accepted but not referenced in the function body.
 * @param {string} topLabel - label rendered with the slice count.
 * @param {string} bottomLabel - label rendered with the summed value.
 * @param {string} topIcon - text placed before topLabel when truthy.
 * @param {string} bottomIcon - text placed after bottomLabel when truthy.
 * @param {string} [sizing='linear'] - arc sizing: 'linear' uses the value,
 *   'log' uses (value + 1) ** 0.38, 'sqrt' uses sqrt(value + 1).
 * @returns the D3 selection of slice paths.
 */
function renderDonut({ svgId, centerId, field, datasets, getValue, getKey, getTitle, getColor, getMeta, colorScale, topLabel, bottomLabel, topIcon, bottomIcon, sizing = 'linear' }) {
const svg = d3.select('#' + svgId);
// Remove anything previously drawn in this SVG.
svg.selectAll('*').remove();
// Square drawing area taken from the rendered size; the viewBox is centered on (0, 0).
const bbox = svg.node().getBoundingClientRect();
const size = Math.min(bbox.width, bbox.height);
const outerR = size / 2 - 6;
const innerR = outerR * 0.62;
svg.attr('viewBox', `${-size / 2} ${-size / 2} ${size} ${size}`);
// Only items with a positive value are drawn; total and count feed the center read-out.
const filtered = datasets.filter(d => getValue(d) > 0);
const total = d3.sum(filtered, getValue);
const count = filtered.length;
// Sizing strategy for the arc:
// "linear" — true proportions (small slices can vanish)
// "log" — power-compressed so tiny datasets stay visible while the
// big ones (PolyglotText 13M+, PolyglotAudio 1M+) still read
// as clearly the largest slices
// "sqrt" — lighter square-root compression
// Tooltip + center numbers always show real values.
const sizeValue = d => {
const v = getValue(d);
if (sizing === 'log') return Math.pow(v + 1, 0.38);
if (sizing === 'sqrt') return Math.sqrt(v + 1);
return v;
};
const pie = d3.pie().value(sizeValue).sort(null).padAngle(0.022);
const arcs = pie(filtered);
const arcGen = d3.arc().innerRadius(innerR).outerRadius(outerR).cornerRadius(3);
// Without an explicit getColor, colors come from the shared datasetColor scale.
const resolveColor = getColor || (x => datasetColor(getKey(x)));
const g = svg.append('g');
// Slice paths
const paths = g.selectAll('path')
.data(arcs)
.join('path')
.attr('class', 'donut-slice')
.attr('fill', d => resolveColor(d.data))
.attr('stroke', '#000000')
.attr('stroke-width', 2)
.attr('stroke-linejoin', 'round');
// Radial sweep: interpolate endAngle from startAngle → target, so arcs
// literally grow around the ring from 0° of arc to their full sweep.
// _centroid is stashed on the node; focusDonutSlice() reads it as the scale origin.
paths.each(function (d) {
const [cx, cy] = arcGen.centroid(d);
this._centroid = [cx, cy];
this._current = { startAngle: d.startAngle, endAngle: d.startAngle, padAngle: d.padAngle };
});
paths.transition()
.delay((d, i) => i * 80)
.duration(1100)
.ease(d3.easeCubicOut)
.attrTween('d', function (d) {
const interp = d3.interpolate(this._current, d);
// Remember the final arc state on the node.
this._current = interp(1);
return t => arcGen(interp(t));
});
// Hover tooltip + click drill-down
paths
.on('mouseenter', function (ev, d) {
const nm = getTitle(d.data);
const v = getValue(d.data);
const meta = getMeta ? getMeta(d.data) : '';
tooltipEl.innerHTML =
`
${nm}
` +
`${v.toLocaleString()} rows` +
(meta ? ` · ${meta}` : '') + `
`;
gsap.to(tooltipEl, { opacity: 1, duration: 0.15, overwrite: true });
})
.on('mousemove', function (ev) {
// Offset the tooltip 14px from the pointer, in viewport coordinates.
tooltipEl.style.left = (ev.clientX + 14) + 'px';
tooltipEl.style.top = (ev.clientY + 14) + 'px';
})
.on('mouseleave', function () {
gsap.to(tooltipEl, { opacity: 0, duration: 0.12, overwrite: true });
})
.on('click', function (ev, d) {
const key = getKey(d.data);
focusDonutSlice(svgId, this, key);
// The typeof guard keeps a missing showDetails from throwing.
if (typeof showDetails === 'function') showDetails(key);
});
// Cache for drill-down reset logic.
donutState.set(svgId, { paths, arcGen, resolveColor });
// Center content — start at 0 and count up with GSAP.
// NOTE(review): the selectors used below (.js-count-top, .js-count-bottom,
// .center-label, .center-number, .center-item.bottom) must be present in the
// markup assigned to centerEl.innerHTML — confirm.
if (centerId) {
const centerEl = document.getElementById(centerId);
const topIconHtml = topIcon ? `${topIcon}` : '';
const bottomIconHtml = bottomIcon ? `${bottomIcon}` : '';
centerEl.innerHTML =
`
${topIconHtml}${topLabel}
0
0
${bottomLabel}${bottomIconHtml}
`;
const topEl = centerEl.querySelector('.js-count-top');
const bottomEl = centerEl.querySelector('.js-count-bottom');
// Tween a plain object and mirror its value into the DOM on every update;
// onComplete writes the exact final value.
const topObj = { v: 0 };
gsap.to(topObj, {
v: count,
duration: 1.0,
ease: 'power2.out',
delay: 0.55,
onUpdate: () => { topEl.textContent = Math.floor(topObj.v); },
onComplete: () => { topEl.textContent = count; }
});
const bottomObj = { v: 0 };
gsap.to(bottomObj, {
v: total,
duration: 1.6,
ease: 'power2.out',
delay: 0.65,
onUpdate: () => { bottomEl.innerHTML = formatRows(Math.floor(bottomObj.v)); },
onComplete: () => { bottomEl.innerHTML = formatRows(total); }
});
gsap.from(`#${centerId} .center-label`, { y: 14, opacity: 0, duration: 0.55, ease: 'power3.out', delay: 0.45, stagger: 0.15 });
gsap.from(`#${centerId} .center-number`, { y: 10, opacity: 0, duration: 0.55, ease: 'power3.out', delay: 0.55, stagger: 0.15 });
// Space out the top/bottom blocks since the divider is gone.
centerEl.querySelector('.center-item.bottom').style.marginTop = '14px';
}
return paths;
}
/**
 * Click drill-down for a donut: scale up the clicked slice and dim the rest.
 * Clicking the slice that is already selected toggles the focus off via
 * resetDonutFocus().
 *
 * Does nothing when no state has been registered for `svgId`.
 *
 * @param {string} svgId - Key into donutState (the chart's SVG id).
 * @param {Element} clickedEl - The slice path node that was clicked.
 * @param {*} clickedKey - Key of the clicked slice; stored as state.selectedKey.
 */
function focusDonutSlice(svgId, clickedEl, clickedKey) {
  const state = donutState.get(svgId);
  if (!state) return;

  // Toggle off if clicking the already-selected slice.
  if (state.selectedKey === clickedKey) {
    resetDonutFocus(svgId);
    return;
  }
  state.selectedKey = clickedKey;

  const { paths, arcGen } = state;
  const nodes = paths.nodes();
  // Materialize the bound data once; calling paths.data() inside the loop
  // rebuilt the whole array for every slice.
  const data = paths.data();

  nodes.forEach((node, i) => {
    if (node !== clickedEl) {
      gsap.to(node, {
        scale: 1,
        opacity: 0.3,
        filter: 'none',
        duration: 0.35,
        ease: 'power2.out',
        overwrite: 'auto'
      });
      return;
    }
    // Scale around the slice's own centroid (cached on the node when available).
    const [cx, cy] = node._centroid || arcGen.centroid(data[i]);
    gsap.to(node, {
      scale: 1.08,
      opacity: 1,
      svgOrigin: `${cx} ${cy}`,
      filter: 'drop-shadow(0 0 14px rgba(255,255,255,0.35)) brightness(1.15)',
      duration: 0.45,
      ease: 'power2.out',
      overwrite: 'auto'
    });
  });
}
/**
 * Clear the drill-down focus of a donut: forget the selected key and tween
 * every slice back to scale 1, full opacity and no filter.
 *
 * Does nothing when no state has been registered for `svgId`.
 *
 * @param {string} svgId - Key into donutState (the chart's SVG id).
 */
function resetDonutFocus(svgId) {
  const entry = donutState.get(svgId);
  if (entry) {
    entry.selectedKey = null;
    for (const sliceNode of entry.paths.nodes()) {
      gsap.to(sliceNode, {
        scale: 1,
        opacity: 1,
        filter: 'none',
        duration: 0.35,
        ease: 'power2.out',
        overwrite: 'auto',
      });
    }
  }
}
// ---- Raw vs Adaption donuts ----
// Both donuts chart DATASETS by row count and differ only in which side of a
// record they read (`raw` or `adaption`), their top label, and arc sizing.
// The Raw donut therefore includes every dataset with a raw repo (the
// PolyglotText / PolyglotAudio pools among them); the Adaption donut includes
// every dataset with an Adaption version. renderDonut() drops zero-value
// entries, which is how records whose side is null are excluded.
for (const { side, topLabel, sizing } of [
  // 'log' compresses the raw donut so tiny datasets still get a visible slice.
  { side: 'raw', topLabel: 'RAW DATASETS', sizing: 'log' },
  // No sizing here, so renderDonut() falls back to its default.
  { side: 'adaption', topLabel: 'ADAPTION SETS' },
]) {
  renderDonut({
    svgId: `chart-${side}`,
    centerId: `center-${side}`,
    field: side,
    datasets: DATASETS,
    getValue: dataset => dataset[side]?.rows || 0,
    getKey: dataset => dataset.key,
    getTitle: dataset => dataset.title,
    getMeta: dataset => (dataset[side] ? dataset[side].repo : ''),
    topLabel,
    bottomLabel: 'ROWS',
    topIcon: '',
    bottomIcon: '',
    sizing,
  });
}
// ---- Modality donut ----
// One slice per modality, sized by the number of datasets listed for it.
const MODALITIES = [
  ['text', 'Text', 5],
  ['audio', 'Audio', 3],
  ['image', 'Image', 3],
  ['code', 'Code', 1],
].map(([key, name, count]) => ({ key, name, count }));
// Dedicated ordinal scale so modality colors are independent of dataset colors.
const modalityColor = d3.scaleOrdinal()
  .domain(MODALITIES.map(({ key }) => key))
  .range(PALETTE);
renderDonut({
  svgId: 'chart-modality',
  centerId: 'center-modality',
  field: 'count',
  datasets: MODALITIES,
  getValue: ({ count }) => count,
  getKey: ({ key }) => key,
  getTitle: ({ name }) => name,
  getColor: ({ key }) => modalityColor(key),
  getMeta: ({ count }) => `${count} datasets`,
  topLabel: 'MODALITIES',
  bottomLabel: 'DATASETS',
  topIcon: '',
  bottomIcon: '',
});
// ===========================================================================
// DETAILS CARD — rendered on slice click with GSAP reveal
// ===========================================================================
/**
 * Hook called when the selected treemap cell is clicked again (selection
 * reset). Intentionally does nothing: the details card is left as it is.
 */
function hideLanguageDetails() {
// No-op placeholder — currently we share the single details card; click-
// another to switch. Kept as an explicit symbol for future extension.
}
/**
 * Fill the shared details card with one language's totals and its
 * per-dataset breakdown, then animate the card in and scroll it into view.
 *
 * @param {{code: string, name: string, value: number}} langData - Language
 *   entry; `code` is looked up in each DATASET_LANGS record's `langs`.
 * @param {*} color - Accepted for callers; not used by this function.
 */
function showLanguageDetails(langData, color) {
  const card = document.getElementById('details-card');
  card.style.display = '';

  // Datasets that contain this language, largest contribution first.
  const { code } = langData;
  const perDataset = DATASET_LANGS
    .filter(source => (source.langs[code] || 0) > 0)
    .map(source => ({ dataset: source.name, key: source.key, rows: source.langs[code] }))
    .sort((left, right) => right.rows - left.rows);

  let rowsMarkup = '';
  for (const entry of perDataset) {
    rowsMarkup += `${entry.dataset}
${formatShort(entry.rows)}
(${entry.rows.toLocaleString()})
`;
  }

  card.innerHTML = `
${langData.name} (${langData.code})
Total across the raw corpus: ${langData.value.toLocaleString()} rows.
${rowsMarkup}
`;

  // Slide the card up, then stagger in its .kv rows.
  const hiddenBelow = { y: 80, opacity: 0 };
  const settled = { y: 0, opacity: 1, duration: 0.75, ease: 'power4.out' };
  gsap.fromTo(card, hiddenBelow, settled);

  const kvRows = card.querySelectorAll('.kv');
  gsap.from(kvRows, {
    y: 18,
    opacity: 0,
    duration: 0.45,
    ease: 'power3.out',
    stagger: 0.05,
    delay: 0.2,
  });

  card.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
/**
 * Fill the shared details card with one dataset's catalog record (repos, row
 * counts, schema), then animate the card in and scroll it into view.
 *
 * Returns without touching the card when `key` matches no DATASETS record.
 *
 * @param {string} key - Dataset key to look up in DATASETS.
 */
function showDetails(key) {
  const dataset = DATASETS.find(candidate => candidate.key === key);
  if (!dataset) return;

  const card = document.getElementById('details-card');
  card.style.display = '';

  // Cell formatters: an em dash stands in for a missing raw/adaption side.
  function describeRepo(info) {
    if (info) return `${info.repo}`;
    return `—`;
  }
  function describeRows(info) {
    if (info) return `${formatShort(info.rows)} (${info.rows.toLocaleString()})`;
    return `—`;
  }

  const rawRepo = describeRepo(dataset.raw);
  const rawRows = describeRows(dataset.raw);
  const adaptionRepo = describeRepo(dataset.adaption);
  const adaptionRows = describeRows(dataset.adaption);
  const modelPart = dataset.model ? `
` : '';
  const schemaPart = dataset.schema.map(column => `${column}`).join('');

  card.innerHTML = `
${dataset.title}
${dataset.tagline}
Raw repo
${rawRepo}
Raw rows
${rawRows}
Adaption repo
${adaptionRepo}
Adaption rows
${adaptionRows}
${modelPart}
Schema
${schemaPart}
`;

  // Elegant slide-in from the bottom with power4.out, then stagger the .kv rows.
  const hiddenBelow = { y: 80, opacity: 0 };
  const settled = { y: 0, opacity: 1, duration: 0.75, ease: 'power4.out' };
  gsap.fromTo(card, hiddenBelow, settled);

  const kvRows = card.querySelectorAll('.kv');
  gsap.from(kvRows, {
    y: 18,
    opacity: 0,
    duration: 0.45,
    ease: 'power3.out',
    stagger: 0.05,
    delay: 0.2,
  });

  card.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
// ===========================================================================
// INITIAL PAGE LOAD — hero image → hero stats → chart cards, staggered.
// ===========================================================================
// Page-load timeline. Each step rises 20px while fading in; the position
// strings ('-=0.5', '-=0.3') are the third argument of the later steps.
const loadTl = gsap.timeline();
loadTl.from('.hero-img', { y: 20, opacity: 0, duration: 0.9, ease: 'power3.out' });
loadTl.from(
  '.stat',
  { y: 20, opacity: 0, duration: 0.7, ease: 'power3.out', stagger: 0.15 },
  '-=0.5'
);
loadTl.from(
  '.chart-card',
  { y: 20, opacity: 0, duration: 0.8, ease: 'power3.out', stagger: 0.15 },
  '-=0.3'
);
// ===========================================================================
// LANGUAGE DATA (for the Voronoi treemap)
// ===========================================================================
// Display name for each language code used in DATASET_LANGS. Codes without an
// entry fall back to the raw code when langEntries is built, so every code
// that appears in the data should be listed here.
const LANG_NAMES = {
tr: "Turkish", ru: "Russian", it: "Italian", en: "English", eo: "Esperanto",
hu: "Hungarian", de: "German", fr: "French", pt: "Portuguese", mk: "Macedonian",
es: "Spanish", he: "Hebrew", fi: "Finnish", ber: "Berber", nl: "Dutch",
pl: "Polish", sr: "Serbian", mr: "Marathi", el: "Greek", da: "Danish",
cs: "Czech", sv: "Swedish", bg: "Bulgarian", la: "Latin", zh: "Mandarin",
ro: "Romanian", ia: "Interlingua", ja: "Japanese", tok: "Toki Pona",
lfn: "Lingua Franca Nova", uk: "Ukrainian", tt: "Tatar", tl: "Tagalog",
id: "Indonesian", nb: "Norwegian B.", lt: "Lithuanian", az: "Azerbaijani",
ie: "Interlingue", tlh: "Klingon", jbo: "Lojban", mhr: "Meadow Mari",
bn: "Bengali", fa: "Persian", br: "Breton", ilo: "Ilocano", ar: "Arabic",
ceb: "Cebuano", hi: "Hindi", vi: "Vietnamese", pam: "Kapampangan",
hy: "Armenian", be: "Belarusian", ko: "Korean", yue: "Cantonese",
ca: "Catalan", kab: "Kabyle", af: "Afrikaans", am: "Amharic", yi: "Yiddish",
sat: "Santali", so: "Somali", te: "Telugu", ne: "Nepali", pa: "Punjabi",
ur: "Urdu", ta: "Tamil", ml: "Malayalam", th: "Thai", or: "Odia",
sd: "Sindhi", gu: "Gujarati", kn: "Kannada", my: "Burmese", bo: "Tibetan",
lo: "Lao", mni: "Meitei", kk: "Kazakh", oc: "Occitan", hr: "Croatian",
sk: "Slovak", et: "Estonian", sl: "Slovenian", is: "Icelandic", ms: "Malay",
sq: "Albanian", hsb: "Upper Sorbian", dsb: "Lower Sorbian", mai: "Maithili",
kha: "Khasi", dtp: "Kadazan", yo: "Yoruba", sw: "Swahili", cy: "Welsh",
ga: "Irish", gd: "Scottish Gaelic", ti: "Tigrinya", os: "Ossetian",
sa: "Sanskrit", ug: "Uyghur", uz: "Uzbek", ka: "Georgian", eu: "Basque",
vo: "Volapük", ido: "Ido", nov: "Novial", avk: "Kotava", ldn: "Láadan",
afh: "Afrihili", lzh: "Classical Chinese", non: "Old Norse", ang: "Old English",
grc: "Ancient Greek", sux: "Sumerian", fro: "Old French", cbk: "Chavacano",
zsm: "Standard Malay", war: "Waray", kw: "Cornish", nah: "Nahuatl",
kek: "Q'eqchi'", hif: "Fiji Hindi", crh: "Crimean Tatar", sah: "Sakha",
ext: "Extremaduran", csb: "Kashubian", sgs: "Samogitian", cha: "Chamorro",
tvl: "Tuvaluan", mi: "Maori", lin: "Lingala", arq: "Algerian Arabic",
arz: "Egyptian Arabic", orv: "Old East Slavic", prg: "Old Prussian",
chv: "Chuvash", bar: "Bavarian", pms: "Piedmontese", egl: "Emilian",
jav: "Javanese", sun: "Sundanese", hoc: "Ho", zza: "Zaza",
rif: "Riffian Berber", nog: "Nogai", km: "Khmer",
// Codes that occur in DATASET_LANGS but previously had no name, so their
// treemap cells were labelled with the bare code. `amh` is the three-letter
// code for the language already listed as `am`.
nds: "Low German", amh: "Amharic",
};
const DATASET_LANGS = [
{
key: "polytext", name: "PolyglotText",
langs: {
tr: 1767000, ru: 1695000, it: 1588000, en: 1337000, eo: 1171000,
hu: 817000, de: 675000, fr: 520000, pt: 470000, mk: 398000,
es: 358000, he: 272000, fi: 263000, ber: 180000, nl: 125000,
pl: 118000, sr: 106000, mr: 96000, el: 94000, da: 90000,
cs: 72000, sv: 71000, bg: 70000, la: 66000, zh: 58000, ro: 56000,
ia: 54000, ja: 43000, tok: 39000, lfn: 38000, uk: 38000, tt: 33000,
tl: 31000, id: 31000, nb: 31000, lt: 29000, az: 25000, ie: 24000,
tlh: 23000, jbo: 21000, mhr: 19000, bn: 19000, fa: 17000, br: 17000,
ilo: 17000, ar: 16000, ceb: 15000, hi: 13000, vi: 11000, pam: 11000,
hy: 9000, be: 9000, ko: 9000,
cbk: 19000, sk: 8000, vo: 8000, oc: 8000, et: 8000,
war: 6700, ms: 6700, hr: 6700, eu: 6700, yi: 5400, af: 5400,
km: 4000, ca: 4000, kha: 4000, dtp: 4000, zza: 4000, is: 4000,
avk: 4000, ga: 4000, hoc: 4000, sl: 4000, sq: 4000, chv: 4000,
kw: 4000, sux: 2700, ang: 2700, pms: 2700, prg: 2700, ug: 2700,
lzh: 2700, egl: 2700, ur: 2700, sah: 2700, nds: 2700, mi: 2700,
tvl: 1400, cha: 1400, th: 1400, cy: 1400, non: 1400, yo: 1400,
lin: 1400, grc: 1400, arq: 1400, orv: 1400, sw: 1400, rif: 1400,
crh: 1400, hif: 1400, jav: 1400, sun: 1400, hsb: 1400, dsb: 1400,
amh: 1400, csb: 1400, sgs: 1400, ext: 1400, nov: 1400, nog: 1400,
arz: 1400, nah: 1400, ido: 1400, afh: 1400, kk: 1400,
}
},
{
key: "polyaudio", name: "PolyglotAudio",
langs: {
en: 698000, es: 261000, eo: 105000, de: 32000, fr: 16000,
ru: 9200, pl: 8800, ber: 6600, nl: 5900, it: 5900,
yue: 4300, pt: 3300, ja: 1400, mr: 1200, ca: 505,
cs: 410, zh: 110, fi: 93, hu: 87, uk: 38,
he: 16, tok: 5, kab: 5,
}
},
{
key: "tts", name: "multilingual-synthetic-tts",
langs: {
ja: 13951, ru: 9105, de: 8972, ko: 8129, es: 7917,
pt: 5438, zh: 5417, en: 5157, fr: 4551,
}
},
{
key: "magazines", name: "magazines-multilingual-vqa",
langs: {
de: 4412, fr: 3279, ru: 2762, pt: 2047, vi: 1637, bn: 1598,
en: 1004, af: 826, ar: 156, it: 136, fa: 132, te: 132,
ja: 130, ne: 108, pa: 100, ur: 98, nl: 95, tr: 85, zh: 83,
ta: 68, ml: 64, id: 43, th: 47, am: 123, yi: 108, sat: 85,
so: 25, hi: 15, es: 10, mr: 9, kn: 8, or: 17, sd: 3,
mai: 1021, la: 146, bo: 25, be: 8, da: 6, ko: 4, bg: 4,
os: 3, sa: 3, my: 3, oc: 1, gd: 1, ti: 1, hy: 1, pl: 1,
mni: 1, uk: 1, lo: 1, kk: 1,
}
},
{ key: "fma", name: "fma-labeled", langs: { en: 29000 } },
{ key: "streetview", name: "streetview-global", langs: { en: 30000 } },
{ key: "current_affairs", name: "current-affairs (raw, 2023-26)", langs: { en: 20694 } },
{ key: "frontend", name: "frontend-coding", langs: { en: 500 } },
{
key: "image_ann", name: "multilingual-image-annotations",
langs: { en: 464, es: 464, fr: 464, hi: 464, zh: 464, ar: 464, pt: 464 }
},
];
// Aggregate totals across the raw corpus: sum every dataset's per-language
// row counts into one total per language code.
const langTotals = {};
DATASET_LANGS.forEach(({ langs }) => {
  Object.keys(langs).forEach((code) => {
    langTotals[code] = (langTotals[code] || 0) + langs[code];
  });
});
// One entry per language, largest total first. `name` falls back to the code
// when LANG_NAMES has no entry; `sizeValue` is log10(total + 10).
const langEntries = Object.keys(langTotals)
  .map((code) => {
    const value = langTotals[code];
    return {
      code,
      name: LANG_NAMES[code] || code,
      value,
      sizeValue: Math.log10(value + 10),
    };
  })
  .sort((left, right) => right.value - left.value);
// ===========================================================================
// VORONOI TREEMAP (D3) — palette quantile coloring, black strokes
// ===========================================================================
/**
 * Immediately-invoked setup of the language Voronoi treemap inside
 * #chart-treemap: one cell per langEntries item (sized by sizeValue), with
 * labels, a hover tooltip (#voronoi-tooltip) and click drill-down. The chart
 * is drawn once and redrawn on window resize, debounced by 200 ms.
 */
(function renderVoronoi() {
const container = document.getElementById('chart-treemap');
const tooltip = document.getElementById('voronoi-tooltip');
// Bail out with an inline message when d3 or d3.voronoiTreemap is unavailable.
if (typeof d3 === 'undefined' || !d3.voronoiTreemap) {
container.insertAdjacentHTML('beforeend',
`
Voronoi treemap libraries failed to load. Check your network / CSP.
`);
return;
}
// Map language size buckets onto PALETTE via quantiles, so cells naturally
// cluster by tonal groups.
const colorScale = d3.scaleQuantile()
.domain(langEntries.map(e => e.sizeValue))
.range(PALETTE);
// Build the whole chart from scratch; any SVG already in the container is removed first.
function draw() {
container.querySelectorAll('svg').forEach(s => s.remove());
const rect = container.getBoundingClientRect();
// Never lay out smaller than 320 x 320.
const width = Math.max(320, rect.width);
const height = Math.max(320, rect.height);
// Circular clip polygon
const clipPad = 6;
const cx = width / 2, cy = height / 2;
const r = Math.min(width, height) / 2 - clipPad;
// The circle is approximated by a polygon with N vertices.
const N = 96;
const clipPolygon = d3.range(N).map(i => [
cx + r * Math.cos((i / N) * 2 * Math.PI),
cy + r * Math.sin((i / N) * 2 * Math.PI),
]);
// Flat hierarchy: a root whose children are the language entries, weighted by sizeValue.
const root = d3.hierarchy({ name: 'root', children: langEntries })
.sum(d => d.sizeValue);
const treemap = d3.voronoiTreemap()
.clip(clipPolygon)
.convergenceRatio(0.005)
.maxIterationCount(120)
.minWeightRatio(0.01);
treemap(root);
const svg = d3.select(container).append('svg')
.attr('viewBox', `0 0 ${width} ${height}`)
.attr('preserveAspectRatio', 'xMidYMid meet');
const g = svg.append('g');
// Outer circle outline
g.append('circle')
.attr('cx', cx).attr('cy', cy).attr('r', r + 1)
.attr('fill', 'none')
.attr('stroke', '#1f1f1f')
.attr('stroke-width', 1);
const leaves = root.leaves();
const cells = g.selectAll('path.voronoi-cell')
.data(leaves)
.join('path')
.attr('class', 'voronoi-cell')
.attr('d', d => 'M' + d.polygon.map(p => p.join(',')).join('L') + 'Z')
.attr('fill', d => colorScale(d.data.sizeValue))
.attr('stroke', '#000000')
.attr('stroke-width', 1.2)
.attr('stroke-linejoin', 'round');
// Record centroid per cell so we can scale from the cell's own center.
// NOTE(review): the destructuring assumes leaf.site, when set, is an [x, y]
// array; otherwise the polygon centroid is used — confirm against the library.
cells.each(function (leaf) {
const [cx, cy] = leaf.site || d3.polygonCentroid(leaf.polygon);
this._centroid = [cx, cy];
});
// Mosaic build: fade + scale up from each cell's own centroid, staggered.
cells.nodes().forEach((node, i) => {
const [cx, cy] = node._centroid;
gsap.fromTo(node,
{ scale: 0, opacity: 0, svgOrigin: `${cx} ${cy}` },
{ scale: 1, opacity: 1, svgOrigin: `${cx} ${cy}`,
duration: 0.55, ease: 'power3.out', delay: i * 0.012 }
);
});
// Labels — sized by cell area; smaller cells hide the name, tiny ones only show on hover.
// polyArea: polygon area via the shoelace formula (absolute value, so vertex order does not matter).
function polyArea(pts) {
let a = 0;
for (let i = 0, n = pts.length; i < n; i++) {
const [x1, y1] = pts[i], [x2, y2] = pts[(i + 1) % n];
a += x1 * y2 - x2 * y1;
}
return Math.abs(a) / 2;
}
leaves.forEach(leaf => {
const area = polyArea(leaf.polygon);
// Side length of a square with the same area; used as the size threshold.
const side = Math.sqrt(area);
const [x, y] = leaf.site || d3.polygonCentroid(leaf.polygon);
const d = leaf.data;
if (side >= 44) {
// Large cell: name on the first line, "code · rows" on a second line.
const nameSize = Math.max(10, Math.min(18, side / 6));
const codeSize = Math.max(9, Math.min(13, side / 9));
const text = g.append('text')
.datum(leaf.data)
.attr('class', 'voronoi-label')
.attr('x', x).attr('y', y - 2)
.attr('font-size', nameSize);
text.append('tspan').text(d.name);
text.append('tspan')
.attr('class', 'code')
.attr('x', x).attr('dy', nameSize * 0.95)
.attr('font-size', codeSize)
.text(`${d.code} · ${formatShort(d.value)}`);
} else if (side >= 22) {
// Medium cell: language code only. Smaller cells get no label.
const sz = Math.max(8, Math.min(11, side / 3));
g.append('text')
.datum(leaf.data)
.attr('class', 'voronoi-label')
.attr('x', x).attr('y', y + sz / 3)
.attr('font-size', sz)
.text(d.code);
}
});
// Hover tooltip
cells
.on('mouseenter', (ev, d) => {
tooltip.innerHTML =
`${d.data.name}` +
`(${d.data.code})` +
`${d.data.value.toLocaleString()} rows
`;
gsap.to(tooltip, { opacity: 1, duration: 0.12, overwrite: true });
})
.on('mousemove', (ev) => {
// Position relative to the container's top-left corner, 12px from the pointer.
const bb = container.getBoundingClientRect();
tooltip.style.left = (ev.clientX - bb.left + 12) + 'px';
tooltip.style.top = (ev.clientY - bb.top + 12) + 'px';
})
.on('mouseleave', () => {
gsap.to(tooltip, { opacity: 0, duration: 0.1, overwrite: true });
});
// Click drill-down: highlight the clicked cell, dim the rest, surface a
// language detail card below the treemap.
// selectedCell is local to this draw() call, so a redraw starts with no selection.
let selectedCell = null;
cells.on('click', function (ev, d) {
// NOTE(review): cx / cy are not used in this handler; the loops below use
// each node's own centroid instead.
const [cx, cy] = this._centroid;
const sameAgain = selectedCell === this;
if (sameAgain) {
// reset
selectedCell = null;
cells.nodes().forEach(node => {
const [ncx, ncy] = node._centroid;
gsap.to(node, {
scale: 1, opacity: 1,
svgOrigin: `${ncx} ${ncy}`,
filter: 'none',
duration: 0.35, ease: 'power2.out', overwrite: 'auto',
});
});
g.selectAll('.voronoi-label').each(function() {
gsap.to(this, { opacity: 1, duration: 0.35, overwrite: 'auto' });
});
hideLanguageDetails();
return;
}
selectedCell = this;
cells.nodes().forEach(node => {
const [ncx, ncy] = node._centroid;
if (node === this) {
gsap.to(node, {
scale: 1.1, opacity: 1,
svgOrigin: `${ncx} ${ncy}`,
filter: 'drop-shadow(0 0 10px rgba(255,255,255,0.45)) brightness(1.35)',
duration: 0.45, ease: 'power2.out', overwrite: 'auto',
});
} else {
// Every non-selected cell is faded out completely (opacity 0).
gsap.to(node, {
scale: 1, opacity: 0,
svgOrigin: `${ncx} ${ncy}`,
filter: 'none',
duration: 0.35, ease: 'power2.out', overwrite: 'auto',
});
}
});
// Labels carry the language entry as their datum; keep only the selected language's label visible.
g.selectAll('.voronoi-label').each(function(ld) {
if (ld && ld.code === d.data.code) {
gsap.to(this, { opacity: 1, duration: 0.45, overwrite: 'auto' });
} else {
gsap.to(this, { opacity: 0, duration: 0.35, overwrite: 'auto' });
}
});
showLanguageDetails(d.data, colorScale(d.data.sizeValue));
// Also emit the custom event so external code can react.
container.dispatchEvent(new CustomEvent('voronoi-drilldown', {
detail: { code: d.data.code, name: d.data.name, rows: d.data.value }
}));
});
}
draw();
// Debounce handle for the resize listener: redraw 200 ms after the last resize event.
let t;
window.addEventListener('resize', () => {
clearTimeout(t);
t = setTimeout(draw, 200);
});
})();