// ===========================================================================
// PALETTE — pulled from CSS variables so tokens stay single-source-of-truth
// ===========================================================================
const CSS = getComputedStyle(document.documentElement);

// Reads --palette-1 … --palette-12 from the root element, in order. A missing
// custom property yields an empty string (getPropertyValue + trim).
const PALETTE = Array.from({ length: 12 }, (_, i) =>
  CSS.getPropertyValue(`--palette-${i + 1}`).trim()
);

/**
 * Brighten a color for the inner tick mark (and voronoi hovers).
 *
 * @param {string} hex - Any color string accepted by d3.hsl().
 * @param {number} [lightnessBoost=0.4] - Added to HSL lightness, capped at 0.9.
 * @param {number} [saturationBoost=0.12] - Added to HSL saturation, capped at 1.
 * @returns {string} The adjusted color as produced by formatHex().
 */
function luminousVariant(hex, lightnessBoost = 0.4, saturationBoost = 0.12) {
  const c = d3.hsl(hex);
  c.l = Math.min(0.9, c.l + lightnessBoost);
  c.s = Math.min(1, c.s + saturationBoost);
  return c.formatHex();
}

/**
 * Split a number into a one-decimal mantissa string and a unit suffix.
 *
 * @param {number} n - Value to compact.
 * @returns {{ text: string, unit: string }} `unit` is 'M', 'k' or '' (value
 *   below 1,000, in which case `text` is just String(n)).
 */
function compactParts(n) {
  if (n >= 1_000_000) {
    return { text: (n / 1_000_000).toFixed(1), unit: 'M' };
  }
  if (n >= 1_000) {
    const text = (n / 1_000).toFixed(1);
    // Values just under one million (e.g. 999,950+) can round up to "1000.0";
    // promote them to the next unit instead of printing "1000.0k".
    if (text === '1000.0') return { text: '1.0', unit: 'M' };
    return { text, unit: 'k' };
  }
  return { text: String(n), unit: '' };
}

/**
 * Format a row count with exactly one decimal and an M/k suffix
 * (e.g. 68677 -> "68.7k", 1200000 -> "1.2M"). Values below 1,000 are
 * returned via String(n) unchanged.
 *
 * @param {number} n
 * @returns {string}
 */
function formatRows(n) {
  const { text, unit } = compactParts(n);
  return `${text}${unit}`;
}

/**
 * Like formatRows(), but drops a trailing ".0" from the mantissa
 * (e.g. 30000 -> "30k", 68677 -> "68.7k").
 *
 * @param {number} n
 * @returns {string}
 */
function formatShort(n) {
  const { text, unit } = compactParts(n);
  return unit ? text.replace(/\.0$/, '') + unit : text;
}

// Upgrade hero stat numbers to support a decimal span.
// data-value is parsed as: optional ~/≈ + digits/commas, optional ".digits",
// optional alphabetic/"+" suffix. Anything else is written back verbatim as text.
document.querySelectorAll('.stat .num').forEach((el) => {
  const raw = el.getAttribute('data-value');
  const m = raw?.match(/^([~≈]?[\d,]+)(\.[\d]+)?([A-Za-z+]+)?$/);
  if (!m) {
    el.textContent = raw || '';
    return;
  }
  const [, whole, frac = '', suffix = ''] = m;
  el.innerHTML = (frac || suffix) ? `${whole}${frac}${suffix}` : whole;
});

// ===========================================================================
// DATASET CATALOG
// ===========================================================================
// Each entry: key, title, tagline, raw / adaption ({ repo, rows } or null),
// languages, modality, license, schema (column names), optional model, group.
const DATASETS = [
  {
    key: "speech",
    title: "Multilingual Synthetic Speech",
    tagline: "Zero-shot voice cloning with Qwen3-TTS across 9 languages",
    raw: { repo: "Reubencf/multilingual-synthetic-tts", rows: 68677 },
    adaption: { repo: "Reubencf/Adaption-multilingual-speech", rows: 10274 },
    languages: "en, ja, zh, ko, de, es, fr, ru, pt",
    modality: "audio + text",
    license: "open / synthetic",
    schema: ["audio", "text", "language", "language_name", "style", "voice", "sample_rate"],
    model: "Qwen3-TTS-12Hz-1.7B-Base",
    group: "paired"
  },
  {
    key: "sentences",
    title: "Multilingual Sentences (text-only)",
    tagline: "Text projection of the TTS corpus — ready for Adaption",
    raw: null,
    adaption: { repo: "Reubencf/Adaption-multilingual-sentences", rows: 10000 },
    languages: "ja, ru, ko, de, es, pt, zh, en, fr + 114 more",
    modality: "text",
    license: "open",
    schema: ["text", "enhanced_prompt", "enhanced_completion", "language", "voice", "style"],
    group: "paired"
  },
  {
    key: "music",
    title: "Music — FMA Labeled",
    tagline: "Creative-Commons music tracks with lyrics, genre, mood, BPM, key",
    raw: { repo: "Reubencf/fma-labeled", rows: 29000 },
    adaption: { repo: "Reubencf/adaption-music-style-prompts", rows: 9950 },
    languages: "en",
    modality: "audio + text",
    license: "CC-BY / CC0 (source-dependent)",
    schema: ["audio", "lyrics", "genre", "sub_genres", "mood", "instruments", "bpm", "key", "vocal_type", "energy", "era", "quality"],
    model: "gemini-flash-latest",
    group: "paired"
  },
  {
    key: "street",
    title: "StreetView Global",
    tagline: "Globally-sampled Mapillary street images with scene classification",
    raw: { repo: "Reubencf/streetview-global", rows: 30000 },
    adaption: { repo: "Reubencf/adaption-street-scene-descriptions", rows: 10100 },
    languages: "en",
    modality: "image + text",
    license: "CC-BY-SA-4.0",
    schema: ["image", "scene_description", "setting", "weather", "time_of_day", "road_type", "infrastructure", "lat", "lon", "compass"],
    group: "paired"
  },
  {
    key: "magazines",
    title: "Magazines Multilingual VQA",
    tagline: "Public-domain magazine OCR in 40+ source languages (including low-resource)",
    raw: { repo: "Reubencf/magazines-multilingual-vqa", rows: 29039 },
    adaption: { repo: "Reubencf/adaption-multilingual-doc-qa", rows: 8800 },
    languages: "ar, de, en, es, fr, hi, it, ja, pt, zh + 35 more (Afrikaans, Amharic, Yoruba, Yiddish, Bengali, Santali, Somali, Vietnamese, Russian, Maithili, Tibetan, …)",
    modality: "image + text",
    license: "CC-BY-4.0",
    schema: ["image", "ocr_text", "english_description", "question", "answer", "target_language", "page_type"],
    model: "Gemma 4 31B via vLLM",
    group: "paired"
  },
  {
    key: "lowresource",
    title: "Low-Resource Doc Q/A",
    tagline: "Low-resource-language slice of the magazines corpus",
    raw: null,
    adaption: { repo: "Reubencf/Adaption-low-resource-doc-qa", rows: 10200 },
    languages: "Afrikaans, Amharic, Yoruba, Yiddish, Bengali, Santali, Somali, Vietnamese, Maithili, Tigrinya, Meitei, Lao, …",
    modality: "image + text",
    license: "CC-BY-4.0",
    schema: ["image", "ocr_text", "question", "answer", "source_language"],
    group: "paired"
  },
  {
    key: "captions",
    title: "Multilingual Image Captions",
    tagline: "English + multilingual captions with bounding-box visualizations",
    raw: { repo: "Reubencf/multilingual-image-annotations", rows: 464 },
    adaption: { repo: "Reubencf/adaption-multilingual-image-captions", rows: 462 },
    languages: "en, es, fr, hi, zh, ar, pt",
    modality: "image + text",
    license: "CC-BY-4.0",
    schema: ["image", "boxed_image", "description_en", "descriptions", "vqa", "detections"],
    model: "Gemma 4 31B",
    group: "paired"
  },
  {
    key: "frontend",
    title: "Frontend Coding",
    tagline: "Hand-curated HTML / Tailwind / JS prompts and completions",
    raw: { repo: "Reubencf/frontend-coding", rows: 500 },
    adaption: { repo: "Reubencf/frontend-html-tailwind-js", rows: 145 },
    languages: "en",
    modality: "text (code)",
    license: "MIT",
    schema: ["prompt", "previous_code", "code", "reasoning"],
    group: "paired"
  },
  {
    key: "news2026",
    title: "Current Affairs 2026",
    tagline: "2026 Wikipedia current-events Q/A with RAG grounding (through Apr 9, 2026)",
    raw: { repo: "Reubencf/future-news-events-2026", rows: 5447 },
    adaption: { repo: "Reubencf/current-affairs-2026", rows: 5339 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion", "reasoning_trace", "date", "event_id", "section", "source"],
    model: "Cohere Command R + RAG",
    group: "paired"
  },
  {
    key: "news2025",
    title: "Current Affairs 2025",
    tagline: "2025 global events Q/A",
    raw: { repo: "Reubencf/2025_events", rows: 5390 },
    adaption: { repo: "Reubencf/current-affairs-2025", rows: 5390 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
    group: "paired"
  },
  {
    key: "news2024",
    title: "Current Affairs 2024",
    tagline: "2024 global events Q/A",
    raw: { repo: "Reubencf/2024_events", rows: 5190 },
    adaption: { repo: "Reubencf/current-affairs-2024", rows: 5190 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
    group: "paired"
  },
  {
    key: "news2023",
    title: "Current Affairs 2023",
    tagline: "2023 global events Q/A",
    raw: { repo: "Reubencf/2023_events", rows: 4667 },
    adaption: { repo: "Reubencf/current-affairs-2023", rows: 4667 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
    group: "paired"
  },
  // Pre-training pools — now included in the raw donut too.
  {
    key: "polyaudio",
    title: "PolyglotAudio",
    tagline: "Broad multilingual audio pre-training pool",
    raw: { repo: "Reubencf/PolyglotAudio", rows: 1200000 },
    adaption: null,
    languages: "multilingual",
    modality: "audio + text",
    license: "open",
    schema: ["audio", "text", "language"],
    group: "paired"
  },
  {
    key: "polytext",
    title: "PolyglotText",
    tagline: "Large multilingual text pre-training pool",
    raw: { repo: "Reubencf/PolyglotText", rows: 13400000 },
    adaption: null,
    languages: "multilingual",
    modality: "text",
    license: "open",
    schema: ["text", "language"],
    group: "paired"
  },
];

// Stable color per dataset key — cycles through PALETTE
const datasetColor = d3.scaleOrdinal(PALETTE).domain(DATASETS.map(d => d.key));
DATASETS.forEach(d => { d.color = datasetColor(d.key); });

// ===========================================================================
// DONUT CHART (D3 — used for both Raw/Adaption donuts and the modality donut)
// ===========================================================================
const tooltipEl = document.getElementById('donut-tooltip');

// Per-SVG selection state for drill-down (scale-up selected, dim others).
const donutState = new Map(); // svgId -> { selectedKey, paths, arcGen, resolveColor }

/**
 * Render an animated donut chart into an existing <svg> and, optionally,
 * count-up totals into a center element.
 *
 * @param {object}   opts
 * @param {string}   opts.svgId        - id of the target <svg>.
 * @param {string}  [opts.centerId]    - id of the element receiving the center labels.
 * @param {string}  [opts.field]       - accepted for caller compatibility; not read here.
 * @param {Array}    opts.datasets     - items to plot; items whose value is <= 0 are skipped.
 * @param {Function} opts.getValue     - item -> numeric value (real value, shown in tooltip/center).
 * @param {Function} opts.getKey       - item -> stable key (used for color + drill-down).
 * @param {Function} opts.getTitle     - item -> tooltip title.
 * @param {Function} [opts.getColor]   - item -> fill; defaults to datasetColor(getKey(item)).
 * @param {Function} [opts.getMeta]    - item -> extra tooltip text.
 * @param {*}       [opts.colorScale]  - accepted for caller compatibility; not read here.
 * @param {string}   opts.topLabel     - label next to the slice count.
 * @param {string}   opts.bottomLabel  - label next to the value total.
 * @param {string}  [opts.topIcon]     - markup placed before topLabel.
 * @param {string}  [opts.bottomIcon]  - markup placed after bottomLabel.
 * @param {'linear'|'log'|'sqrt'} [opts.sizing='linear'] - arc sizing strategy.
 * @returns {object} d3 selection of the slice paths (empty when the SVG is missing).
 */
function renderDonut({
  svgId, centerId, field, datasets, getValue, getKey, getTitle, getColor, getMeta,
  colorScale, topLabel, bottomLabel, topIcon, bottomIcon, sizing = 'linear'
}) {
  const svg = d3.select('#' + svgId);
  // A missing target must not abort the rest of the page script.
  if (svg.empty()) return svg.selectAll('path');

  svg.selectAll('*').remove();
  const bbox = svg.node().getBoundingClientRect();
  const size = Math.min(bbox.width, bbox.height);
  // Clamp: a zero-sized (e.g. hidden) SVG would otherwise yield a negative radius.
  const outerR = Math.max(0, size / 2 - 6);
  const innerR = outerR * 0.62;
  svg.attr('viewBox', `${-size / 2} ${-size / 2} ${size} ${size}`);

  const filtered = datasets.filter(d => getValue(d) > 0);
  const total = d3.sum(filtered, getValue);
  const count = filtered.length;

  // Sizing strategy for the arc:
  //   "linear" — true proportions (small slices can vanish)
  //   "log"    — power-compressed so tiny datasets stay visible while the
  //              big ones (PolyglotText 13M+, PolyglotAudio 1M+) still read
  //              as clearly the largest slices
  //   "sqrt"   — lighter square-root compression
  // Tooltip + center numbers always show real values.
  const sizeValue = d => {
    const v = getValue(d);
    if (sizing === 'log') return Math.pow(v + 1, 0.38);
    if (sizing === 'sqrt') return Math.sqrt(v + 1);
    return v;
  };

  const pie = d3.pie().value(sizeValue).sort(null).padAngle(0.022);
  const arcs = pie(filtered);
  const arcGen = d3.arc().innerRadius(innerR).outerRadius(outerR).cornerRadius(3);
  const resolveColor = getColor || (x => datasetColor(getKey(x)));
  const g = svg.append('g');

  // Slice paths
  const paths = g.selectAll('path')
    .data(arcs)
    .join('path')
    .attr('class', 'donut-slice')
    .attr('fill', d => resolveColor(d.data))
    .attr('stroke', '#000000')
    .attr('stroke-width', 2)
    .attr('stroke-linejoin', 'round');

  // Radial sweep: interpolate endAngle from startAngle → target, so arcs
  // literally grow around the ring from 0° of arc to their full sweep.
  paths.each(function (d) {
    const [cx, cy] = arcGen.centroid(d);
    this._centroid = [cx, cy]; // reused as the scale origin in focusDonutSlice()
    this._current = { startAngle: d.startAngle, endAngle: d.startAngle, padAngle: d.padAngle };
  });
  paths.transition()
    .delay((d, i) => i * 80)
    .duration(1100)
    .ease(d3.easeCubicOut)
    .attrTween('d', function (d) {
      const interp = d3.interpolate(this._current, d);
      this._current = interp(1);
      return t => arcGen(interp(t));
    });

  // Hover tooltip + click drill-down
  paths
    .on('mouseenter', function (ev, d) {
      if (!tooltipEl) return;
      const nm = getTitle(d.data);
      const v = getValue(d.data);
      const meta = getMeta ? getMeta(d.data) : '';
      tooltipEl.innerHTML = `
${nm}
` + `
${v.toLocaleString()} rows` + (meta ? ` · ${meta}` : '') + `
`;
      gsap.to(tooltipEl, { opacity: 1, duration: 0.15, overwrite: true });
    })
    .on('mousemove', function (ev) {
      if (!tooltipEl) return;
      tooltipEl.style.left = (ev.clientX + 14) + 'px';
      tooltipEl.style.top = (ev.clientY + 14) + 'px';
    })
    .on('mouseleave', function () {
      if (!tooltipEl) return;
      gsap.to(tooltipEl, { opacity: 0, duration: 0.12, overwrite: true });
    })
    .on('click', function (ev, d) {
      const key = getKey(d.data);
      focusDonutSlice(svgId, this, key);
      if (typeof showDetails === 'function') showDetails(key);
    });

  // Cache for drill-down reset logic.
  donutState.set(svgId, { paths, arcGen, resolveColor });

  // Center content — start at 0 and count up with GSAP.
  const centerEl = centerId ? document.getElementById(centerId) : null;
  if (centerEl) {
    const topIconHtml = topIcon ? `${topIcon}` : '';
    const bottomIconHtml = bottomIcon ? `${bottomIcon}` : '';
    centerEl.innerHTML = `
${topIconHtml}${topLabel}
0
0
${bottomLabel}${bottomIconHtml}
`;

    // The count targets are looked up after the markup is written; skip the
    // tween when a target is absent rather than throwing on every frame.
    const topEl = centerEl.querySelector('.js-count-top');
    if (topEl) {
      const topObj = { v: 0 };
      gsap.to(topObj, {
        v: count, duration: 1.0, ease: 'power2.out', delay: 0.55,
        onUpdate: () => { topEl.textContent = Math.floor(topObj.v); },
        onComplete: () => { topEl.textContent = count; }
      });
    }

    const bottomEl = centerEl.querySelector('.js-count-bottom');
    if (bottomEl) {
      const bottomObj = { v: 0 };
      gsap.to(bottomObj, {
        v: total, duration: 1.6, ease: 'power2.out', delay: 0.65,
        onUpdate: () => { bottomEl.innerHTML = formatRows(Math.floor(bottomObj.v)); },
        onComplete: () => { bottomEl.innerHTML = formatRows(total); }
      });
    }

    gsap.from(`#${centerId} .center-label`, {
      y: 14, opacity: 0, duration: 0.55, ease: 'power3.out', delay: 0.45, stagger: 0.15
    });
    gsap.from(`#${centerId} .center-number`, {
      y: 10, opacity: 0, duration: 0.55, ease: 'power3.out', delay: 0.55, stagger: 0.15
    });

    // Space out the top/bottom blocks since the divider is gone.
    const bottomItem = centerEl.querySelector('.center-item.bottom');
    if (bottomItem) bottomItem.style.marginTop = '14px';
  }

  return paths;
}

/**
 * Click drill-down: scale up clicked slice, dim the rest, toggle on re-click.
 *
 * @param {string}  svgId      - id previously passed to renderDonut().
 * @param {Element} clickedEl  - the slice <path> that was clicked.
 * @param {string}  clickedKey - key of the clicked slice (as returned by getKey).
 */
function focusDonutSlice(svgId, clickedEl, clickedKey) {
  const state = donutState.get(svgId);
  if (!state) return;

  // Toggle off if clicking the already-selected slice
  if (state.selectedKey === clickedKey) {
    resetDonutFocus(svgId);
    return;
  }
  state.selectedKey = clickedKey;

  const { paths, arcGen } = state;
  // selection.each hands us node (`this`) and datum together, avoiding a
  // paths.data() array rebuild per node.
  paths.each(function (d) {
    if (this === clickedEl) {
      const [cx, cy] = this._centroid || arcGen.centroid(d);
      gsap.to(this, {
        scale: 1.08,
        opacity: 1,
        svgOrigin: `${cx} ${cy}`,
        filter: 'drop-shadow(0 0 14px rgba(255,255,255,0.35)) brightness(1.15)',
        duration: 0.45,
        ease: 'power2.out',
        overwrite: 'auto'
      });
    } else {
      gsap.to(this, {
        scale: 1,
        opacity: 0.3,
        filter: 'none',
        duration: 0.35,
        ease: 'power2.out',
        overwrite: 'auto'
      });
    }
  });
}

/**
 * Clear the drill-down selection of a donut and restore every slice to
 * scale 1 / opacity 1 / no filter.
 *
 * @param {string} svgId - id previously passed to renderDonut().
 */
function resetDonutFocus(svgId) {
  const state = donutState.get(svgId);
  if (!state) return;
  state.selectedKey = null;
  state.paths.nodes().forEach(node => {
    gsap.to(node, {
      scale: 1, opacity: 1, filter: 'none', duration: 0.35, ease: 'power2.out', overwrite: 'auto'
    });
  });
}

// ---- Raw vs Adaption donuts ----
// The Raw donut shows every dataset that has a raw repo (including the
// PolyglotText / PolyglotAudio pre-training pools). The Adaption donut shows
// every dataset with an Adaption-remastered version. renderDonut() filters
// out zero-value entries automatically.
renderDonut({
  svgId: 'chart-raw',
  centerId: 'center-raw',
  field: 'raw',
  datasets: DATASETS,
  getValue: d => d.raw?.rows || 0,
  getKey: d => d.key,
  getTitle: d => d.title,
  getMeta: d => d.raw ? d.raw.repo : '',
  topLabel: 'RAW DATASETS',
  bottomLabel: 'ROWS',
  topIcon: '',
  bottomIcon: '',
  sizing: 'log', // compress so tiny datasets still get a visible slice
});

renderDonut({
  svgId: 'chart-adaption',
  centerId: 'center-adaption',
  field: 'adaption',
  datasets: DATASETS,
  getValue: d => d.adaption?.rows || 0,
  getKey: d => d.key,
  getTitle: d => d.title,
  getMeta: d => d.adaption ? d.adaption.repo : '',
  topLabel: 'ADAPTION SETS',
  bottomLabel: 'ROWS',
  topIcon: '',
  bottomIcon: '',
});

// ---- Modality donut ----
const MODALITIES = [
  { key: 'text', name: 'Text', count: 5 },
  { key: 'audio', name: 'Audio', count: 3 },
  { key: 'image', name: 'Image', count: 3 },
  { key: 'code', name: 'Code', count: 1 },
];
const modalityColor = d3.scaleOrdinal(PALETTE).domain(MODALITIES.map(m => m.key));

renderDonut({
  svgId: 'chart-modality',
  centerId: 'center-modality',
  field: 'count',
  datasets: MODALITIES,
  getValue: d => d.count,
  getKey: d => d.key,
  getTitle: d => d.name,
  getColor: d => modalityColor(d.key),
  getMeta: d => `${d.count} datasets`,
  topLabel: 'MODALITIES',
  bottomLabel: 'DATASETS',
  topIcon: '',
  bottomIcon: '',
});

// ===========================================================================
// DETAILS CARD — rendered on slice click with GSAP reveal
// ===========================================================================

/**
 * Shared reveal for the details card: slide the card in from the bottom,
 * stagger its `.kv` children, then scroll it into view.
 *
 * @param {Element} card - the details card element (already populated).
 */
function revealDetailsCard(card) {
  // Elegant slide-in from the bottom with power4.out.
  gsap.fromTo(card,
    { y: 80, opacity: 0 },
    { y: 0, opacity: 1, duration: 0.75, ease: 'power4.out' }
  );
  gsap.from(card.querySelectorAll('.kv'), {
    y: 18, opacity: 0, duration: 0.45, ease: 'power3.out', stagger: 0.05, delay: 0.2
  });
  card.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}

function hideLanguageDetails() {
  // No-op placeholder — currently we share the single details card; click-
  // another to switch. Kept as an explicit symbol for future extension.
}

/**
 * Fill the shared details card with a per-dataset row breakdown for one
 * language and reveal it.
 *
 * @param {{ code: string, name: string, value: number }} langData - a langEntries item.
 * @param {string} [color] - accepted from the caller; not read here.
 */
function showLanguageDetails(langData, color) {
  const card = document.getElementById('details-card');
  if (!card) return;
  card.style.display = '';

  // Per-dataset breakdown for this language.
  const breakdown = DATASET_LANGS
    .map(d => ({ dataset: d.name, key: d.key, rows: d.langs[langData.code] || 0 }))
    .filter(d => d.rows > 0)
    .sort((a, b) => b.rows - a.rows);

  const rows = breakdown.map(b => `
${b.dataset}
${formatShort(b.rows)} (${b.rows.toLocaleString()})
` ).join('');

  card.innerHTML = `

${langData.name} (${langData.code})

Total across the raw corpus: ${langData.value.toLocaleString()} rows.

${rows}
`;

  revealDetailsCard(card);
}

/**
 * Fill the shared details card with the catalog entry for one dataset and
 * reveal it. Unknown keys are ignored.
 *
 * @param {string} key - a DATASETS[].key value.
 */
function showDetails(key) {
  const d = DATASETS.find(x => x.key === key);
  if (!d) return;
  const card = document.getElementById('details-card');
  if (!card) return;
  card.style.display = '';

  // Both helpers render an empty string when the dataset has no raw/adaption side.
  const repoLink = info => info ? `${info.repo}` : ``;
  const rowsCell = info => info ? `${formatShort(info.rows)} (${info.rows.toLocaleString()})` : ``;

  card.innerHTML = `

${d.title}

${d.tagline}

Raw repo
${repoLink(d.raw)}
Raw rows
${rowsCell(d.raw)}
Adaption repo
${repoLink(d.adaption)}
Adaption rows
${rowsCell(d.adaption)}
Modality
${d.modality}
License
${d.license}
${d.model ? `
Annotator
${d.model}
` : ''}
Languages
${d.languages}
Schema
${d.schema.map(c => `${c}`).join('')}
`;

  revealDetailsCard(card);
}

// ===========================================================================
// INITIAL PAGE LOAD — hero image → hero stats → chart cards, staggered.
// ===========================================================================
const loadTl = gsap.timeline();
loadTl
  .from('.hero-img', { y: 20, opacity: 0, duration: 0.9, ease: 'power3.out' })
  .from('.stat', { y: 20, opacity: 0, duration: 0.7, ease: 'power3.out', stagger: 0.15 }, '-=0.5')
  .from('.chart-card', { y: 20, opacity: 0, duration: 0.8, ease: 'power3.out', stagger: 0.15 }, '-=0.3');

// ===========================================================================
// LANGUAGE DATA (for the Voronoi treemap)
// ===========================================================================
// Display names keyed by the language codes used in DATASET_LANGS. Codes
// without an entry fall back to the raw code when langEntries is built.
const LANG_NAMES = {
  tr: "Turkish", ru: "Russian", it: "Italian", en: "English", eo: "Esperanto",
  hu: "Hungarian", de: "German", fr: "French", pt: "Portuguese", mk: "Macedonian",
  es: "Spanish", he: "Hebrew", fi: "Finnish", ber: "Berber", nl: "Dutch",
  pl: "Polish", sr: "Serbian", mr: "Marathi", el: "Greek", da: "Danish",
  cs: "Czech", sv: "Swedish", bg: "Bulgarian", la: "Latin", zh: "Mandarin",
  ro: "Romanian", ia: "Interlingua", ja: "Japanese", tok: "Toki Pona",
  lfn: "Lingua Franca Nova", uk: "Ukrainian", tt: "Tatar", tl: "Tagalog",
  id: "Indonesian", nb: "Norwegian B.", lt: "Lithuanian", az: "Azerbaijani",
  ie: "Interlingue", tlh: "Klingon", jbo: "Lojban", mhr: "Meadow Mari",
  bn: "Bengali", fa: "Persian", br: "Breton", ilo: "Ilocano", ar: "Arabic",
  ceb: "Cebuano", hi: "Hindi", vi: "Vietnamese", pam: "Kapampangan",
  hy: "Armenian", be: "Belarusian", ko: "Korean", yue: "Cantonese", ca: "Catalan",
  kab: "Kabyle", af: "Afrikaans", am: "Amharic", yi: "Yiddish", sat: "Santali",
  so: "Somali", te: "Telugu", ne: "Nepali", pa: "Punjabi", ur: "Urdu",
  ta: "Tamil", ml: "Malayalam", th: "Thai", or: "Odia", sd: "Sindhi",
  gu: "Gujarati", kn: "Kannada", my: "Burmese", bo: "Tibetan", lo: "Lao",
  mni: "Meitei", kk: "Kazakh", oc: "Occitan", hr: "Croatian", sk: "Slovak",
  et: "Estonian", sl: "Slovenian", is: "Icelandic", ms: "Malay", sq: "Albanian",
  hsb: "Upper Sorbian", dsb: "Lower Sorbian", mai: "Maithili", kha: "Khasi",
  dtp: "Kadazan", yo: "Yoruba", sw: "Swahili", cy: "Welsh", ga: "Irish",
  gd: "Scottish Gaelic", ti: "Tigrinya", os: "Ossetian", sa: "Sanskrit",
  ug: "Uyghur", uz: "Uzbek", ka: "Georgian", eu: "Basque", vo: "Volapük",
  ido: "Ido", nov: "Novial", avk: "Kotava", ldn: "Láadan", afh: "Afrihili",
  lzh: "Classical Chinese", non: "Old Norse", ang: "Old English",
  grc: "Ancient Greek", sux: "Sumerian", fro: "Old French", cbk: "Chavacano",
  zsm: "Standard Malay", war: "Waray", kw: "Cornish", nah: "Nahuatl",
  kek: "Q'eqchi'", hif: "Fiji Hindi", crh: "Crimean Tatar", sah: "Sakha",
  ext: "Extremaduran", csb: "Kashubian", sgs: "Samogitian", cha: "Chamorro",
  tvl: "Tuvaluan", mi: "Maori", lin: "Lingala", arq: "Algerian Arabic",
  arz: "Egyptian Arabic", orv: "Old East Slavic", prg: "Old Prussian",
  chv: "Chuvash", bar: "Bavarian", pms: "Piedmontese", egl: "Emilian",
  jav: "Javanese", sun: "Sundanese", hoc: "Ho", zza: "Zaza",
  rif: "Riffian Berber", nog: "Nogai", km: "Khmer",
  // Present in PolyglotText but previously unnamed (rendered as the bare code).
  nds: "Low German",
};

// Per-dataset language row counts, keyed by the same codes as LANG_NAMES.
const DATASET_LANGS = [
  { key: "polytext", name: "PolyglotText", langs: {
    tr: 1767000, ru: 1695000, it: 1588000, en: 1337000, eo: 1171000, hu: 817000,
    de: 675000, fr: 520000, pt: 470000, mk: 398000, es: 358000, he: 272000,
    fi: 263000, ber: 180000, nl: 125000, pl: 118000, sr: 106000, mr: 96000,
    el: 94000, da: 90000, cs: 72000, sv: 71000, bg: 70000, la: 66000,
    zh: 58000, ro: 56000, ia: 54000, ja: 43000, tok: 39000, lfn: 38000,
    uk: 38000, tt: 33000, tl: 31000, id: 31000, nb: 31000, lt: 29000,
    az: 25000, ie: 24000, tlh: 23000, jbo: 21000, mhr: 19000, bn: 19000,
    fa: 17000, br: 17000, ilo: 17000, ar: 16000, ceb: 15000, hi: 13000,
    vi: 11000, pam: 11000, hy: 9000, be: 9000, ko: 9000, cbk: 19000,
    sk: 8000, vo: 8000, oc: 8000, et: 8000, war: 6700, ms: 6700,
    hr: 6700, eu: 6700, yi: 5400, af: 5400, km: 4000, ca: 4000,
    kha: 4000, dtp: 4000, zza: 4000, is: 4000, avk: 4000, ga: 4000,
    hoc: 4000, sl: 4000, sq: 4000, chv: 4000, kw: 4000, sux: 2700,
    ang: 2700, pms: 2700, prg: 2700, ug: 2700, lzh: 2700, egl: 2700,
    ur: 2700, sah: 2700, nds: 2700, mi: 2700, tvl: 1400, cha: 1400,
    th: 1400, cy: 1400, non: 1400, yo: 1400, lin: 1400, grc: 1400,
    arq: 1400, orv: 1400, sw: 1400, rif: 1400, crh: 1400, hif: 1400,
    jav: 1400, sun: 1400, hsb: 1400, dsb: 1400,
    // Was keyed "amh"; every other table here uses "am" for Amharic, so the
    // old key produced a second, unnamed Amharic cell in the treemap.
    am: 1400,
    csb: 1400, sgs: 1400, ext: 1400, nov: 1400, nog: 1400, arz: 1400,
    nah: 1400, ido: 1400, afh: 1400, kk: 1400,
  } },
  { key: "polyaudio", name: "PolyglotAudio", langs: {
    en: 698000, es: 261000, eo: 105000, de: 32000, fr: 16000, ru: 9200,
    pl: 8800, ber: 6600, nl: 5900, it: 5900, yue: 4300, pt: 3300,
    ja: 1400, mr: 1200, ca: 505, cs: 410, zh: 110, fi: 93,
    hu: 87, uk: 38, he: 16, tok: 5, kab: 5,
  } },
  { key: "tts", name: "multilingual-synthetic-tts", langs: {
    ja: 13951, ru: 9105, de: 8972, ko: 8129, es: 7917, pt: 5438,
    zh: 5417, en: 5157, fr: 4551,
  } },
  { key: "magazines", name: "magazines-multilingual-vqa", langs: {
    de: 4412, fr: 3279, ru: 2762, pt: 2047, vi: 1637, bn: 1598,
    en: 1004, af: 826, ar: 156, it: 136, fa: 132, te: 132,
    ja: 130, ne: 108, pa: 100, ur: 98, nl: 95, tr: 85,
    zh: 83, ta: 68, ml: 64, id: 43, th: 47, am: 123,
    yi: 108, sat: 85, so: 25, hi: 15, es: 10, mr: 9,
    kn: 8, or: 17, sd: 3, mai: 1021, la: 146, bo: 25,
    be: 8, da: 6, ko: 4, bg: 4, os: 3, sa: 3,
    my: 3, oc: 1, gd: 1, ti: 1, hy: 1, pl: 1,
    mni: 1, uk: 1, lo: 1, kk: 1,
  } },
  { key: "fma", name: "fma-labeled", langs: { en: 29000 } },
  { key: "streetview", name: "streetview-global", langs: { en: 30000 } },
  { key: "current_affairs", name: "current-affairs (raw, 2023-26)", langs: { en: 20694 } },
  { key: "frontend", name: "frontend-coding", langs: { en: 500 } },
  { key: "image_ann", name: "multilingual-image-annotations", langs: {
    en: 464, es: 464, fr: 464, hi: 464, zh: 464, ar: 464, pt: 464
  } },
];

// Aggregate totals across the raw corpus
const langTotals = {};
for (const d of DATASET_LANGS) {
  for (const [lang, n] of Object.entries(d.langs)) {
    langTotals[lang] = (langTotals[lang] || 0) + n;
  }
}

// One entry per language, largest first. sizeValue is the log-compressed
// weight used for cell area and color bucketing; value is the real row count.
const langEntries = Object.entries(langTotals)
  .sort((a, b) => b[1] - a[1])
  .map(([code, n]) => ({
    code,
    name: LANG_NAMES[code] || code,
    value: n,
    sizeValue: Math.log10(n + 10),
  }));

// ===========================================================================
// VORONOI TREEMAP (D3) — palette quantile coloring, black strokes
// ===========================================================================
(function renderVoronoi() {
  const container = document.getElementById('chart-treemap');
  const tooltip = document.getElementById('voronoi-tooltip');
  // Nothing to draw into; bail out instead of throwing.
  if (!container) return;

  if (typeof d3 === 'undefined' || !d3.voronoiTreemap) {
    container.insertAdjacentHTML('beforeend', `
Voronoi treemap libraries failed to load. Check your network / CSP.
`);
    return;
  }

  // Map language size buckets onto PALETTE via quantiles, so cells naturally
  // cluster by tonal groups.
  const colorScale = d3.scaleQuantile()
    .domain(langEntries.map(e => e.sizeValue))
    .range(PALETTE);

  /** Area of a polygon given as [x, y] points (shoelace formula). */
  function polyArea(pts) {
    let a = 0;
    for (let i = 0, n = pts.length; i < n; i++) {
      const [x1, y1] = pts[i];
      const [x2, y2] = pts[(i + 1) % n];
      a += x1 * y2 - x2 * y1;
    }
    return Math.abs(a) / 2;
  }

  /** (Re)build the whole treemap SVG for the container's current size. */
  function draw() {
    container.querySelectorAll('svg').forEach(s => s.remove());
    const rect = container.getBoundingClientRect();
    const width = Math.max(320, rect.width);
    const height = Math.max(320, rect.height);

    // Circular clip polygon
    const clipPad = 6;
    const cx = width / 2;
    const cy = height / 2;
    const r = Math.min(width, height) / 2 - clipPad;
    const N = 96;
    const clipPolygon = d3.range(N).map(i => [
      cx + r * Math.cos((i / N) * 2 * Math.PI),
      cy + r * Math.sin((i / N) * 2 * Math.PI),
    ]);

    const root = d3.hierarchy({ name: 'root', children: langEntries })
      .sum(d => d.sizeValue);

    const treemap = d3.voronoiTreemap()
      .clip(clipPolygon)
      .convergenceRatio(0.005)
      .maxIterationCount(120)
      .minWeightRatio(0.01);
    treemap(root);

    const svg = d3.select(container).append('svg')
      .attr('viewBox', `0 0 ${width} ${height}`)
      .attr('preserveAspectRatio', 'xMidYMid meet');
    const g = svg.append('g');

    // Outer circle outline
    g.append('circle')
      .attr('cx', cx).attr('cy', cy).attr('r', r + 1)
      .attr('fill', 'none')
      .attr('stroke', '#1f1f1f')
      .attr('stroke-width', 1);

    const leaves = root.leaves();
    const cells = g.selectAll('path.voronoi-cell')
      .data(leaves)
      .join('path')
      .attr('class', 'voronoi-cell')
      .attr('d', d => 'M' + d.polygon.map(p => p.join(',')).join('L') + 'Z')
      .attr('fill', d => colorScale(d.data.sizeValue))
      .attr('stroke', '#000000')
      .attr('stroke-width', 1.2)
      .attr('stroke-linejoin', 'round');

    // Record centroid per cell so we can scale from the cell's own center.
    cells.each(function (leaf) {
      const [px, py] = leaf.site || d3.polygonCentroid(leaf.polygon);
      this._centroid = [px, py];
    });

    // Mosaic build: fade + scale up from each cell's own centroid, staggered.
    cells.nodes().forEach((node, i) => {
      const [px, py] = node._centroid;
      gsap.fromTo(node,
        { scale: 0, opacity: 0, svgOrigin: `${px} ${py}` },
        { scale: 1, opacity: 1, svgOrigin: `${px} ${py}`,
          duration: 0.55, ease: 'power3.out', delay: i * 0.012 }
      );
    });

    // Labels — sized by cell area; smaller cells hide the name, tiny ones only show on hover.
    leaves.forEach(leaf => {
      const area = polyArea(leaf.polygon);
      const side = Math.sqrt(area);
      const [x, y] = leaf.site || d3.polygonCentroid(leaf.polygon);
      const d = leaf.data;
      if (side >= 44) {
        const nameSize = Math.max(10, Math.min(18, side / 6));
        const codeSize = Math.max(9, Math.min(13, side / 9));
        const text = g.append('text')
          .datum(leaf.data)
          .attr('class', 'voronoi-label')
          .attr('x', x).attr('y', y - 2)
          .attr('font-size', nameSize);
        text.append('tspan').text(d.name);
        text.append('tspan')
          .attr('class', 'code')
          .attr('x', x).attr('dy', nameSize * 0.95)
          .attr('font-size', codeSize)
          .text(`${d.code} · ${formatShort(d.value)}`);
      } else if (side >= 22) {
        const sz = Math.max(8, Math.min(11, side / 3));
        g.append('text')
          .datum(leaf.data)
          .attr('class', 'voronoi-label')
          .attr('x', x).attr('y', y + sz / 3)
          .attr('font-size', sz)
          .text(d.code);
      }
    });

    // Hover tooltip
    cells
      .on('mouseenter', (ev, d) => {
        if (!tooltip) return;
        tooltip.innerHTML = `${d.data.name}` + `(${d.data.code})` + `
${d.data.value.toLocaleString()} rows
`;
        gsap.to(tooltip, { opacity: 1, duration: 0.12, overwrite: true });
      })
      .on('mousemove', (ev) => {
        if (!tooltip) return;
        // Position is computed relative to the chart container's box.
        const bb = container.getBoundingClientRect();
        tooltip.style.left = (ev.clientX - bb.left + 12) + 'px';
        tooltip.style.top = (ev.clientY - bb.top + 12) + 'px';
      })
      .on('mouseleave', () => {
        if (!tooltip) return;
        gsap.to(tooltip, { opacity: 0, duration: 0.1, overwrite: true });
      });

    // Click drill-down: highlight the clicked cell, dim the rest, surface a
    // language detail card below the treemap.
    let selectedCell = null;
    cells.on('click', function (ev, d) {
      const sameAgain = selectedCell === this;
      if (sameAgain) {
        // reset
        selectedCell = null;
        cells.nodes().forEach(node => {
          const [ncx, ncy] = node._centroid;
          gsap.to(node, {
            scale: 1, opacity: 1, svgOrigin: `${ncx} ${ncy}`,
            filter: 'none', duration: 0.35, ease: 'power2.out', overwrite: 'auto',
          });
        });
        g.selectAll('.voronoi-label').each(function () {
          gsap.to(this, { opacity: 1, duration: 0.35, overwrite: 'auto' });
        });
        hideLanguageDetails();
        return;
      }

      selectedCell = this;
      cells.nodes().forEach(node => {
        const [ncx, ncy] = node._centroid;
        if (node === this) {
          gsap.to(node, {
            scale: 1.1, opacity: 1, svgOrigin: `${ncx} ${ncy}`,
            filter: 'drop-shadow(0 0 10px rgba(255,255,255,0.45)) brightness(1.35)',
            duration: 0.45, ease: 'power2.out', overwrite: 'auto',
          });
        } else {
          // NOTE(review): non-selected cells fade fully out (opacity 0), unlike
          // the donut's 0.3 dim — confirm this is the intended effect.
          gsap.to(node, {
            scale: 1, opacity: 0, svgOrigin: `${ncx} ${ncy}`,
            filter: 'none', duration: 0.35, ease: 'power2.out', overwrite: 'auto',
          });
        }
      });
      // Label datum is the language entry, so match on its code.
      g.selectAll('.voronoi-label').each(function (ld) {
        if (ld && ld.code === d.data.code) {
          gsap.to(this, { opacity: 1, duration: 0.45, overwrite: 'auto' });
        } else {
          gsap.to(this, { opacity: 0, duration: 0.35, overwrite: 'auto' });
        }
      });

      showLanguageDetails(d.data, colorScale(d.data.sizeValue));

      // Also emit the custom event so external code can react.
      container.dispatchEvent(new CustomEvent('voronoi-drilldown', {
        detail: { code: d.data.code, name: d.data.name, rows: d.data.value }
      }));
    });
  }

  draw();

  // Debounced redraw on resize (200 ms after the last resize event).
  let t;
  window.addEventListener('resize', () => {
    clearTimeout(t);
    t = setTimeout(draw, 200);
  });
})();