Spaces:
No application file
No application file
// ===========================================================================
// PALETTE — resolved from the --palette-1 … --palette-12 custom properties on
// the root element, so the stylesheet stays the single source of truth.
// ===========================================================================
// NOTE(review): in a browser this binding shadows the built-in `CSS` namespace
// for the rest of the script — confirm nothing relies on CSS.supports/escape.
const CSS = getComputedStyle(document.documentElement);
const PALETTE = Array.from({ length: 12 }, (_, index) =>
  CSS.getPropertyValue(`--palette-${index + 1}`).trim()
);
/**
 * Derive a lighter, more saturated hex colour from `hex`.
 *
 * The colour is converted with d3.hsl, its lightness is raised by
 * `lightnessBoost` (capped at 0.9) and its saturation by `saturationBoost`
 * (capped at 1), then formatted back to a hex string.
 */
function luminousVariant(hex, lightnessBoost = 0.4, saturationBoost = 0.12) {
  const tinted = d3.hsl(hex);
  const { l: baseLightness, s: baseSaturation } = tinted;
  tinted.s = Math.min(1, baseSaturation + saturationBoost);
  tinted.l = Math.min(0.9, baseLightness + lightnessBoost);
  return tinted.formatHex();
}
/**
 * Format a row count as an HTML fragment with one decimal place:
 * the integer part, followed by `<span class="decimal">.Dk</span>` for
 * thousands or `<span class="decimal">.DM</span>` for millions.
 * Values that are not >= 1000 are returned as `String(n)`.
 *
 * A value whose thousands representation rounds up to 1000.0
 * (e.g. 999 999) is promoted to the millions unit, so the result is
 * "1.0M" rather than "1000.0k".
 *
 * @param {number} n row count
 * @returns {string} HTML string
 */
function formatRows(n) {
  // Written as a negated >= so NaN / undefined fall through to String(n).
  if (!(n >= 1_000)) return String(n);
  const thousands = (n / 1_000).toFixed(1);
  const useMillions = n >= 1_000_000 || Number(thousands) >= 1_000;
  const scaled = useMillions ? (n / 1_000_000).toFixed(1) : thousands;
  const [whole, frac] = scaled.split('.');
  const unit = useMillions ? 'M' : 'k';
  return `${whole}<span class="decimal">.${frac}${unit}</span>`;
}
/**
 * Format a row count as compact plain text: "1.5k", "13.4M", "999".
 * One decimal place is kept, and a trailing ".0" is dropped.
 *
 * A value whose thousands representation rounds up to 1000
 * (e.g. 999 999) is promoted to the millions unit, so the result is
 * "1M" rather than "1000k".
 *
 * @param {number} n row count
 * @returns {string}
 */
function formatShort(n) {
  // Written as a negated >= so NaN / undefined fall through to String(n).
  if (!(n >= 1_000)) return String(n);
  const thousands = (n / 1_000).toFixed(1);
  const useMillions = n >= 1_000_000 || Number(thousands) >= 1_000;
  const scaled = useMillions ? (n / 1_000_000).toFixed(1) : thousands;
  const unit = useMillions ? 'M' : 'k';
  return `${scaled.replace(/\.0$/, '')}${unit}`;
}
// Rewrite each hero stat from its data-value attribute: the leading integer
// part stays as plain text, while any fractional part and/or unit suffix is
// wrapped in <span class="decimal">. Values that do not match the expected
// shape are written back verbatim as text.
for (const statEl of document.querySelectorAll('.stat .num')) {
  const rawValue = statEl.getAttribute('data-value');
  const parts = rawValue && rawValue.match(/^([~≈]?[\d,]+)(\.[\d]+)?([A-Za-z+]+)?$/);
  if (!parts) {
    statEl.textContent = rawValue || '';
    continue;
  }
  const whole = parts[1];
  const frac = parts[2] === undefined ? '' : parts[2];
  const suffix = parts[3] === undefined ? '' : parts[3];
  if (frac || suffix) {
    statEl.innerHTML = `${whole}<span class="decimal">${frac}${suffix}</span>`;
  } else {
    statEl.innerHTML = whole;
  }
}
// ===========================================================================
// DATASET CATALOG
// ===========================================================================
// One entry per dataset shown on the page. Fields, as consumed by the code in
// this file:
//   key       — unique id; used as the colour-scale domain and to look the
//               entry up when a donut slice is clicked.
//   title     — display name (tooltip + details card heading).
//   tagline   — one-line description shown in the details card.
//   raw       — { repo, rows } for the raw dataset, or null when there is none.
//   adaption  — { repo, rows } for the Adaption version, or null when there is
//               none. `repo` is appended to https://huggingface.co/datasets/
//               to build the link; `rows` drives the donut slice size.
//   languages, modality, license — display strings for the details card.
//   schema    — column names, each rendered as a <code> chip.
//   model     — optional; shown as "Annotator" when present.
//   group     — "paired" on every entry; not read in the visible part of this
//               file (NOTE(review): confirm whether it is still used).
// A `color` property is added to every entry right after this literal.
const DATASETS = [
  {
    key: "speech",
    title: "Multilingual Synthetic Speech",
    tagline: "Zero-shot voice cloning with Qwen3-TTS across 9 languages",
    raw: { repo: "Reubencf/multilingual-synthetic-tts", rows: 68677 },
    adaption: { repo: "Reubencf/Adaption-multilingual-speech", rows: 10274 },
    languages: "en, ja, zh, ko, de, es, fr, ru, pt",
    modality: "audio + text",
    license: "open / synthetic",
    schema: ["audio", "text", "language", "language_name", "style", "voice", "sample_rate"],
    model: "Qwen3-TTS-12Hz-1.7B-Base",
    group: "paired"
  },
  {
    key: "sentences",
    title: "Multilingual Sentences (text-only)",
    tagline: "Text projection of the TTS corpus — ready for Adaption",
    raw: null,
    adaption: { repo: "Reubencf/Adaption-multilingual-sentences", rows: 10000 },
    languages: "ja, ru, ko, de, es, pt, zh, en, fr + 114 more",
    modality: "text",
    license: "open",
    schema: ["text", "enhanced_prompt", "enhanced_completion", "language", "voice", "style"],
    group: "paired"
  },
  {
    key: "music",
    title: "Music — FMA Labeled",
    tagline: "Creative-Commons music tracks with lyrics, genre, mood, BPM, key",
    raw: { repo: "Reubencf/fma-labeled", rows: 29000 },
    adaption: { repo: "Reubencf/adaption-music-style-prompts", rows: 9950 },
    languages: "en",
    modality: "audio + text",
    license: "CC-BY / CC0 (source-dependent)",
    schema: ["audio", "lyrics", "genre", "sub_genres", "mood", "instruments", "bpm", "key", "vocal_type", "energy", "era", "quality"],
    model: "gemini-flash-latest",
    group: "paired"
  },
  {
    key: "street",
    title: "StreetView Global",
    tagline: "Globally-sampled Mapillary street images with scene classification",
    raw: { repo: "Reubencf/streetview-global", rows: 30000 },
    adaption: { repo: "Reubencf/adaption-street-scene-descriptions", rows: 10100 },
    languages: "en",
    modality: "image + text",
    license: "CC-BY-SA-4.0",
    schema: ["image", "scene_description", "setting", "weather", "time_of_day", "road_type", "infrastructure", "lat", "lon", "compass"],
    group: "paired"
  },
  {
    key: "magazines",
    title: "Magazines Multilingual VQA",
    tagline: "Public-domain magazine OCR in 40+ source languages (including low-resource)",
    raw: { repo: "Reubencf/magazines-multilingual-vqa", rows: 29039 },
    adaption: { repo: "Reubencf/adaption-multilingual-doc-qa", rows: 8800 },
    languages: "ar, de, en, es, fr, hi, it, ja, pt, zh + 35 more (Afrikaans, Amharic, Yoruba, Yiddish, Bengali, Santali, Somali, Vietnamese, Russian, Maithili, Tibetan, …)",
    modality: "image + text",
    license: "CC-BY-4.0",
    schema: ["image", "ocr_text", "english_description", "question", "answer", "target_language", "page_type"],
    model: "Gemma 4 31B via vLLM",
    group: "paired"
  },
  {
    key: "lowresource",
    title: "Low-Resource Doc Q/A",
    tagline: "Low-resource-language slice of the magazines corpus",
    raw: null,
    adaption: { repo: "Reubencf/Adaption-low-resource-doc-qa", rows: 10200 },
    languages: "Afrikaans, Amharic, Yoruba, Yiddish, Bengali, Santali, Somali, Vietnamese, Maithili, Tigrinya, Meitei, Lao, …",
    modality: "image + text",
    license: "CC-BY-4.0",
    schema: ["image", "ocr_text", "question", "answer", "source_language"],
    group: "paired"
  },
  {
    key: "captions",
    title: "Multilingual Image Captions",
    tagline: "English + multilingual captions with bounding-box visualizations",
    raw: { repo: "Reubencf/multilingual-image-annotations", rows: 464 },
    adaption: { repo: "Reubencf/adaption-multilingual-image-captions", rows: 462 },
    languages: "en, es, fr, hi, zh, ar, pt",
    modality: "image + text",
    license: "CC-BY-4.0",
    schema: ["image", "boxed_image", "description_en", "descriptions", "vqa", "detections"],
    model: "Gemma 4 31B",
    group: "paired"
  },
  {
    key: "frontend",
    title: "Frontend Coding",
    tagline: "Hand-curated HTML / Tailwind / JS prompts and completions",
    raw: { repo: "Reubencf/frontend-coding", rows: 500 },
    adaption: { repo: "Reubencf/frontend-html-tailwind-js", rows: 145 },
    languages: "en",
    modality: "text (code)",
    license: "MIT",
    schema: ["prompt", "previous_code", "code", "reasoning"],
    group: "paired"
  },
  {
    key: "news2026",
    title: "Current Affairs 2026",
    tagline: "2026 Wikipedia current-events Q/A with RAG grounding (through Apr 9, 2026)",
    raw: { repo: "Reubencf/future-news-events-2026", rows: 5447 },
    adaption: { repo: "Reubencf/current-affairs-2026", rows: 5339 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion", "reasoning_trace", "date", "event_id", "section", "source"],
    model: "Cohere Command R + RAG",
    group: "paired"
  },
  {
    key: "news2025",
    title: "Current Affairs 2025",
    tagline: "2025 global events Q/A",
    raw: { repo: "Reubencf/2025_events", rows: 5390 },
    adaption: { repo: "Reubencf/current-affairs-2025", rows: 5390 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
    group: "paired"
  },
  {
    key: "news2024",
    title: "Current Affairs 2024",
    tagline: "2024 global events Q/A",
    raw: { repo: "Reubencf/2024_events", rows: 5190 },
    adaption: { repo: "Reubencf/current-affairs-2024", rows: 5190 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
    group: "paired"
  },
  {
    key: "news2023",
    title: "Current Affairs 2023",
    tagline: "2023 global events Q/A",
    raw: { repo: "Reubencf/2023_events", rows: 4667 },
    adaption: { repo: "Reubencf/current-affairs-2023", rows: 4667 },
    languages: "en",
    modality: "text",
    license: "open",
    schema: ["question", "answer", "enhanced_prompt", "enhanced_completion"],
    group: "paired"
  },
  // Pre-training pools — now included in the raw donut too.
  // They have no Adaption version (adaption: null).
  {
    key: "polyaudio",
    title: "PolyglotAudio",
    tagline: "Broad multilingual audio pre-training pool",
    raw: { repo: "Reubencf/PolyglotAudio", rows: 1200000 },
    adaption: null,
    languages: "multilingual",
    modality: "audio + text",
    license: "open",
    schema: ["audio", "text", "language"],
    group: "paired"
  },
  {
    key: "polytext",
    title: "PolyglotText",
    tagline: "Large multilingual text pre-training pool",
    raw: { repo: "Reubencf/PolyglotText", rows: 13400000 },
    adaption: null,
    languages: "multilingual",
    modality: "text",
    license: "open",
    schema: ["text", "language"],
    group: "paired"
  },
];
// One fixed colour per dataset key. PALETTE is the ordinal range, so when
// there are more keys than palette entries the colours cycle.
const datasetColor = d3.scaleOrdinal()
  .domain(DATASETS.map(({ key }) => key))
  .range(PALETTE);
for (const dataset of DATASETS) {
  dataset.color = datasetColor(dataset.key);
}
// ===========================================================================
// DONUT CHART (D3 — shared by the Raw / Adaption donuts and the modality donut)
// ===========================================================================
const tooltipEl = document.getElementById('donut-tooltip');
// Drill-down bookkeeping, one record per rendered SVG:
// svgId -> { paths, arcGen, resolveColor, selectedKey? }
const donutState = new Map();
/**
 * Render an animated donut chart into an existing <svg> and, when `centerId`
 * is given, an animated summary (slice count + summed value) into that element.
 *
 * Entries whose value is not > 0 are dropped before layout. Slice *angles*
 * follow `sizing`, but the tooltip and the center numbers always use the real
 * values returned by `getValue`.
 *
 * @param {object}   opts
 * @param {string}   opts.svgId       id of the target <svg>.
 * @param {string}  [opts.centerId]   id of the element that receives the summary.
 * @param {string}  [opts.field]      accepted for call-site compatibility; not read here.
 * @param {Array}    opts.datasets    items to plot.
 * @param {Function} opts.getValue    item -> numeric value.
 * @param {Function} opts.getKey      item -> key (passed to the click handlers).
 * @param {Function} opts.getTitle    item -> tooltip title.
 * @param {Function} [opts.getColor]  item -> fill; defaults to datasetColor(getKey(item)).
 * @param {Function} [opts.getMeta]   item -> extra tooltip text.
 * @param {*}       [opts.colorScale] accepted for call-site compatibility; not read here.
 * @param {string}   opts.topLabel    label above the slice count.
 * @param {string}   opts.bottomLabel label below the summed value.
 * @param {string}  [opts.topIcon]    HTML placed before the top label.
 * @param {string}  [opts.bottomIcon] HTML placed after the bottom label.
 * @param {string}  [opts.sizing='linear'] 'linear' | 'log' | 'sqrt'.
 * @returns the D3 selection of slice <path> elements (an empty selection when
 *          the target <svg> does not exist).
 */
function renderDonut({ svgId, centerId, field, datasets, getValue, getKey, getTitle, getColor, getMeta, colorScale, topLabel, bottomLabel, topIcon, bottomIcon, sizing = 'linear' }) {
  const svg = d3.select('#' + svgId);
  // Missing mount point: hand back an empty selection instead of throwing on
  // svg.node() === null further down.
  if (svg.empty()) return svg.selectAll('path');

  svg.selectAll('*').remove();
  const bbox = svg.node().getBoundingClientRect();
  const size = Math.min(bbox.width, bbox.height);
  const outerR = size / 2 - 6;
  const innerR = outerR * 0.62;
  svg.attr('viewBox', `${-size / 2} ${-size / 2} ${size} ${size}`);

  const filtered = datasets.filter(d => getValue(d) > 0);
  const total = d3.sum(filtered, getValue);
  const count = filtered.length;

  // Sizing strategy for the arc:
  //   "linear" — true proportions (small slices can vanish)
  //   "log"    — power-compressed (exponent 0.38) so tiny datasets stay visible
  //              while the biggest ones still read as clearly the largest
  //   "sqrt"   — lighter square-root compression
  const sizeValue = d => {
    const v = getValue(d);
    if (sizing === 'log') return Math.pow(v + 1, 0.38);
    if (sizing === 'sqrt') return Math.sqrt(v + 1);
    return v;
  };

  const pie = d3.pie().value(sizeValue).sort(null).padAngle(0.022);
  const arcs = pie(filtered);
  const arcGen = d3.arc().innerRadius(innerR).outerRadius(outerR).cornerRadius(3);
  const resolveColor = getColor || (x => datasetColor(getKey(x)));
  const g = svg.append('g');

  // Slice paths
  const paths = g.selectAll('path')
    .data(arcs)
    .join('path')
    .attr('class', 'donut-slice')
    .attr('fill', d => resolveColor(d.data))
    .attr('stroke', '#000000')
    .attr('stroke-width', 2)
    .attr('stroke-linejoin', 'round');

  // Radial sweep: each slice starts with endAngle === startAngle and is
  // interpolated to its final arc. The centroid is cached on the node so the
  // click drill-down can scale the slice around its own center.
  paths.each(function (d) {
    this._centroid = arcGen.centroid(d);
    this._current = { startAngle: d.startAngle, endAngle: d.startAngle, padAngle: d.padAngle };
  });
  paths.transition()
    .delay((d, i) => i * 80)
    .duration(1100)
    .ease(d3.easeCubicOut)
    .attrTween('d', function (d) {
      const interp = d3.interpolate(this._current, d);
      this._current = interp(1);
      return t => arcGen(interp(t));
    });

  // Hover tooltip + click drill-down. The tooltip handlers are no-ops when the
  // shared tooltip element is absent from the page.
  paths
    .on('mouseenter', function (ev, d) {
      if (!tooltipEl) return;
      const nm = getTitle(d.data);
      const v = getValue(d.data);
      const meta = getMeta ? getMeta(d.data) : '';
      tooltipEl.innerHTML =
        `<div class="t-name">${nm}</div>` +
        `<div class="t-meta">${v.toLocaleString()} rows` +
        (meta ? ` · ${meta}` : '') + `</div>`;
      gsap.to(tooltipEl, { opacity: 1, duration: 0.15, overwrite: true });
    })
    .on('mousemove', function (ev) {
      if (!tooltipEl) return;
      tooltipEl.style.left = (ev.clientX + 14) + 'px';
      tooltipEl.style.top = (ev.clientY + 14) + 'px';
    })
    .on('mouseleave', function () {
      if (!tooltipEl) return;
      gsap.to(tooltipEl, { opacity: 0, duration: 0.12, overwrite: true });
    })
    .on('click', function (ev, d) {
      const key = getKey(d.data);
      focusDonutSlice(svgId, this, key);
      if (typeof showDetails === 'function') showDetails(key);
    });

  // Cache for drill-down reset logic (replaces any previous record, so a
  // re-render also clears the remembered selection).
  donutState.set(svgId, { paths, arcGen, resolveColor });

  // Center content — start at 0 and count up with GSAP. Skipped when no
  // centerId was given or the element does not exist.
  const centerEl = centerId ? document.getElementById(centerId) : null;
  if (centerEl) {
    const topIconHtml = topIcon ? `<span class="icon">${topIcon}</span>` : '';
    const bottomIconHtml = bottomIcon ? `<span class="icon">${bottomIcon}</span>` : '';
    centerEl.innerHTML =
      `<div class="center-item top">
        <div class="center-label">${topIconHtml}${topLabel}</div>
        <div class="center-number js-count-top">0</div>
      </div>
      <div class="center-item bottom">
        <div class="center-number js-count-bottom">0</div>
        <div class="center-label">${bottomLabel}${bottomIconHtml}</div>
      </div>`;
    const topEl = centerEl.querySelector('.js-count-top');
    const bottomEl = centerEl.querySelector('.js-count-bottom');

    const topObj = { v: 0 };
    gsap.to(topObj, {
      v: count,
      duration: 1.0,
      ease: 'power2.out',
      delay: 0.55,
      onUpdate: () => { topEl.textContent = Math.floor(topObj.v); },
      onComplete: () => { topEl.textContent = count; }
    });

    const bottomObj = { v: 0 };
    gsap.to(bottomObj, {
      v: total,
      duration: 1.6,
      ease: 'power2.out',
      delay: 0.65,
      onUpdate: () => { bottomEl.innerHTML = formatRows(Math.floor(bottomObj.v)); },
      onComplete: () => { bottomEl.innerHTML = formatRows(total); }
    });

    gsap.from(`#${centerId} .center-label`, { y: 14, opacity: 0, duration: 0.55, ease: 'power3.out', delay: 0.45, stagger: 0.15 });
    gsap.from(`#${centerId} .center-number`, { y: 10, opacity: 0, duration: 0.55, ease: 'power3.out', delay: 0.55, stagger: 0.15 });
    // Space out the top/bottom blocks since the divider is gone.
    centerEl.querySelector('.center-item.bottom').style.marginTop = '14px';
  }
  return paths;
}
/**
 * Click drill-down for a donut: enlarge and highlight the clicked slice and
 * dim every other slice. Clicking the slice that is already selected clears
 * the focus instead (delegates to resetDonutFocus).
 *
 * Does nothing when no donut has been registered under `svgId`.
 *
 * @param {string}  svgId      id the donut was rendered with.
 * @param {Element} clickedEl  the slice <path> that was clicked.
 * @param {*}       clickedKey key of the clicked slice; remembered as the
 *                             current selection.
 */
function focusDonutSlice(svgId, clickedEl, clickedKey) {
  const state = donutState.get(svgId);
  if (!state) return;

  // Toggle off if clicking the already-selected slice.
  if (state.selectedKey === clickedKey) {
    resetDonutFocus(svgId);
    return;
  }
  state.selectedKey = clickedKey;

  const { paths, arcGen } = state;
  // Read the bound data once; selection.data() builds a fresh array per call.
  const arcData = paths.data();

  paths.nodes().forEach((node, i) => {
    if (node === clickedEl) {
      // Scale around the slice's own centroid (cached at render time, with a
      // recomputed fallback) so it pops outward rather than from the origin.
      const [cx, cy] = node._centroid || arcGen.centroid(arcData[i]);
      gsap.to(node, {
        scale: 1.08,
        opacity: 1,
        svgOrigin: `${cx} ${cy}`,
        filter: 'drop-shadow(0 0 14px rgba(255,255,255,0.35)) brightness(1.15)',
        duration: 0.45,
        ease: 'power2.out',
        overwrite: 'auto'
      });
      return;
    }
    gsap.to(node, {
      scale: 1,
      opacity: 0.3,
      filter: 'none',
      duration: 0.35,
      ease: 'power2.out',
      overwrite: 'auto'
    });
  });
}
/**
 * Clear the drill-down focus of a donut: forget the selected key and tween
 * every slice back to full size, full opacity and no filter.
 * Does nothing when no donut has been registered under `svgId`.
 */
function resetDonutFocus(svgId) {
  const record = donutState.get(svgId);
  if (!record) {
    return;
  }
  record.selectedKey = null;
  for (const slice of record.paths.nodes()) {
    gsap.to(slice, {
      scale: 1,
      opacity: 1,
      filter: 'none',
      duration: 0.35,
      ease: 'power2.out',
      overwrite: 'auto',
    });
  }
}
// ---- Raw vs Adaption donuts ----
// Raw donut: every dataset with a raw repo, pre-training pools included.
// Adaption donut: every dataset with an Adaption version.
// Entries without the relevant repo report 0 rows, and renderDonut() drops
// zero-value entries on its own.
renderDonut({
  svgId: 'chart-raw',
  centerId: 'center-raw',
  field: 'raw',
  datasets: DATASETS,
  getKey: ({ key }) => key,
  getTitle: ({ title }) => title,
  getValue: ({ raw }) => raw?.rows || 0,
  getMeta: ({ raw }) => (raw ? raw.repo : ''),
  topLabel: 'RAW DATASETS',
  topIcon: '',
  bottomLabel: 'ROWS',
  bottomIcon: '',
  // Compressed sizing so the smallest datasets still get a visible slice.
  sizing: 'log',
});
renderDonut({
  svgId: 'chart-adaption',
  centerId: 'center-adaption',
  field: 'adaption',
  datasets: DATASETS,
  getKey: ({ key }) => key,
  getTitle: ({ title }) => title,
  getValue: ({ adaption }) => adaption?.rows || 0,
  getMeta: ({ adaption }) => (adaption ? adaption.repo : ''),
  topLabel: 'ADAPTION SETS',
  topIcon: '',
  bottomLabel: 'ROWS',
  bottomIcon: '',
});
// ---- Modality donut ----
// Hand-maintained number of datasets per modality; each gets its own colour
// from PALETTE via a dedicated ordinal scale.
const MODALITIES = [
  { key: 'text', name: 'Text', count: 5 },
  { key: 'audio', name: 'Audio', count: 3 },
  { key: 'image', name: 'Image', count: 3 },
  { key: 'code', name: 'Code', count: 1 },
];
const modalityColor = d3.scaleOrdinal()
  .domain(MODALITIES.map(({ key }) => key))
  .range(PALETTE);
renderDonut({
  svgId: 'chart-modality',
  centerId: 'center-modality',
  field: 'count',
  datasets: MODALITIES,
  getKey: ({ key }) => key,
  getTitle: ({ name }) => name,
  getValue: ({ count }) => count,
  getColor: ({ key }) => modalityColor(key),
  getMeta: ({ count }) => `${count} datasets`,
  topLabel: 'MODALITIES',
  topIcon: '',
  bottomLabel: 'DATASETS',
  bottomIcon: '',
});
// ===========================================================================
// DETAILS CARD — rendered on slice click with GSAP reveal
// ===========================================================================
/**
 * Intentionally empty. There is a single shared details card, and selecting
 * another item simply replaces its content, so nothing needs hiding yet.
 * The symbol is kept so callers have a stable hook for future behaviour.
 */
function hideLanguageDetails() {}
/**
 * Fill the shared details card with one language's totals: a heading with a
 * colour swatch, the overall row count, and a per-dataset breakdown (largest
 * first, datasets with no rows for the language omitted). The card is then
 * revealed with a GSAP slide-in and scrolled into view.
 *
 * @param {{code: string, name: string, value: number}} langData
 * @param {string} color CSS colour used for the swatch.
 */
function showLanguageDetails(langData, color) {
  const card = document.getElementById('details-card');
  card.style.display = '';

  // Collect the datasets that actually contain this language.
  const breakdown = [];
  for (const source of DATASET_LANGS) {
    const rowCount = source.langs[langData.code] || 0;
    if (rowCount > 0) {
      breakdown.push({ dataset: source.name, key: source.key, rows: rowCount });
    }
  }
  breakdown.sort((left, right) => right.rows - left.rows);

  const rows = breakdown.map(entry =>
    `<div class="kv"><div class="k">${entry.dataset}</div><div class="v"><strong>${formatShort(entry.rows)}</strong>
    <span style="color:var(--muted);font-size:0.85em">(${entry.rows.toLocaleString()})</span></div></div>`
  ).join('');

  card.innerHTML = `
    <h3>
    <span class="swatch" style="background:${color}"></span>
    ${langData.name} <span style="color:var(--muted);font-weight:400;font-size:0.9rem">(${langData.code})</span>
    </h3>
    <p class="tagline">Total across the raw corpus: <strong>${langData.value.toLocaleString()}</strong> rows.</p>
    <div class="kv-grid">${rows}</div>
  `;

  // Card slides up first, then its rows follow with a short stagger.
  const hidden = { y: 80, opacity: 0 };
  const shown = { y: 0, opacity: 1, duration: 0.75, ease: 'power4.out' };
  gsap.fromTo(card, hidden, shown);
  gsap.from(card.querySelectorAll('.kv'), {
    y: 18,
    opacity: 0,
    duration: 0.45,
    ease: 'power3.out',
    stagger: 0.05,
    delay: 0.2,
  });
  card.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
/**
 * Fill the shared details card with one dataset's metadata (repos, row
 * counts, modality, license, optional annotator model, languages, schema),
 * reveal it with a GSAP slide-in and scroll it into view.
 *
 * Does nothing when `key` matches no entry in DATASETS (e.g. a click on a
 * slice that is not a dataset) or when the details card element is absent.
 *
 * @param {string} key dataset key to display.
 */
function showDetails(key) {
  const d = DATASETS.find(x => x.key === key);
  if (!d) return;
  const card = document.getElementById('details-card');
  if (!card) return;
  card.style.display = '';

  // Repo links open in a new tab; rel="noopener noreferrer" keeps the opened
  // page from reaching back into this one through window.opener.
  const repoLink = info => info
    ? `<a href="https://huggingface.co/datasets/${info.repo}" target="_blank" rel="noopener noreferrer">${info.repo}</a>`
    : `<span style="color:var(--muted);">—</span>`;
  const rowsCell = info => info
    ? `<strong>${formatShort(info.rows)}</strong> <span style="color:var(--muted);font-size:0.85em">(${info.rows.toLocaleString()})</span>`
    : `<span style="color:var(--muted);">—</span>`;

  card.innerHTML = `
    <h3>
    <span class="swatch" style="background:${d.color}"></span>
    ${d.title}
    </h3>
    <p class="tagline">${d.tagline}</p>
    <div class="kv-grid">
    <div class="kv"><div class="k">Raw repo</div><div class="v">${repoLink(d.raw)}</div></div>
    <div class="kv"><div class="k">Raw rows</div><div class="v">${rowsCell(d.raw)}</div></div>
    <div class="kv"><div class="k">Adaption repo</div><div class="v">${repoLink(d.adaption)}</div></div>
    <div class="kv"><div class="k">Adaption rows</div><div class="v">${rowsCell(d.adaption)}</div></div>
    <div class="kv"><div class="k">Modality</div><div class="v">${d.modality}</div></div>
    <div class="kv"><div class="k">License</div><div class="v">${d.license}</div></div>
    ${d.model ? `<div class="kv"><div class="k">Annotator</div><div class="v">${d.model}</div></div>` : ''}
    <div class="kv"><div class="k">Languages</div><div class="v">${d.languages}</div></div>
    </div>
    <div class="kv" style="margin-top:18px;">
    <div class="k">Schema</div>
    <div class="schema-list">${d.schema.map(c => `<code>${c}</code>`).join('')}</div>
    </div>
  `;

  // Elegant slide-in from the bottom with power4.out, rows staggered after.
  gsap.fromTo(card,
    { y: 80, opacity: 0 },
    { y: 0, opacity: 1, duration: 0.75, ease: 'power4.out' }
  );
  gsap.from(card.querySelectorAll('.kv'), {
    y: 18, opacity: 0, duration: 0.45, ease: 'power3.out',
    stagger: 0.05, delay: 0.2
  });
  card.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
// ===========================================================================
// INITIAL PAGE LOAD — one GSAP timeline: hero image first, then the hero
// stats, then the chart cards. Each later step overlaps the previous one.
// ===========================================================================
const loadTl = gsap.timeline();
loadTl.from('.hero-img', {
  y: 20,
  opacity: 0,
  duration: 0.9,
  ease: 'power3.out',
});
// Stats begin 0.5s before the hero image tween ends.
loadTl.from('.stat', {
  y: 20,
  opacity: 0,
  duration: 0.7,
  ease: 'power3.out',
  stagger: 0.15,
}, '-=0.5');
// Chart cards begin 0.3s before the stats finish.
loadTl.from('.chart-card', {
  y: 20,
  opacity: 0,
  duration: 0.8,
  ease: 'power3.out',
  stagger: 0.15,
}, '-=0.3');
// ===========================================================================
// LANGUAGE DATA (for the Voronoi treemap)
// ===========================================================================
// Language code -> display name. Codes without an entry are shown as the raw
// code, so every code used by the per-dataset language tables should be
// listed here.
const LANG_NAMES = {
  tr: "Turkish", ru: "Russian", it: "Italian", en: "English", eo: "Esperanto",
  hu: "Hungarian", de: "German", fr: "French", pt: "Portuguese", mk: "Macedonian",
  es: "Spanish", he: "Hebrew", fi: "Finnish", ber: "Berber", nl: "Dutch",
  pl: "Polish", sr: "Serbian", mr: "Marathi", el: "Greek", da: "Danish",
  cs: "Czech", sv: "Swedish", bg: "Bulgarian", la: "Latin", zh: "Mandarin",
  ro: "Romanian", ia: "Interlingua", ja: "Japanese", tok: "Toki Pona",
  lfn: "Lingua Franca Nova", uk: "Ukrainian", tt: "Tatar", tl: "Tagalog",
  id: "Indonesian", nb: "Norwegian B.", lt: "Lithuanian", az: "Azerbaijani",
  ie: "Interlingue", tlh: "Klingon", jbo: "Lojban", mhr: "Meadow Mari",
  bn: "Bengali", fa: "Persian", br: "Breton", ilo: "Ilocano", ar: "Arabic",
  ceb: "Cebuano", hi: "Hindi", vi: "Vietnamese", pam: "Kapampangan",
  hy: "Armenian", be: "Belarusian", ko: "Korean", yue: "Cantonese",
  ca: "Catalan", kab: "Kabyle", af: "Afrikaans", am: "Amharic", yi: "Yiddish",
  sat: "Santali", so: "Somali", te: "Telugu", ne: "Nepali", pa: "Punjabi",
  ur: "Urdu", ta: "Tamil", ml: "Malayalam", th: "Thai", or: "Odia",
  sd: "Sindhi", gu: "Gujarati", kn: "Kannada", my: "Burmese", bo: "Tibetan",
  lo: "Lao", mni: "Meitei", kk: "Kazakh", oc: "Occitan", hr: "Croatian",
  sk: "Slovak", et: "Estonian", sl: "Slovenian", is: "Icelandic", ms: "Malay",
  sq: "Albanian", hsb: "Upper Sorbian", dsb: "Lower Sorbian", mai: "Maithili",
  kha: "Khasi", dtp: "Kadazan", yo: "Yoruba", sw: "Swahili", cy: "Welsh",
  ga: "Irish", gd: "Scottish Gaelic", ti: "Tigrinya", os: "Ossetian",
  sa: "Sanskrit", ug: "Uyghur", uz: "Uzbek", ka: "Georgian", eu: "Basque",
  vo: "Volapük", ido: "Ido", nov: "Novial", avk: "Kotava", ldn: "Láadan",
  afh: "Afrihili", lzh: "Classical Chinese", non: "Old Norse", ang: "Old English",
  grc: "Ancient Greek", sux: "Sumerian", fro: "Old French", cbk: "Chavacano",
  zsm: "Standard Malay", war: "Waray", kw: "Cornish", nah: "Nahuatl",
  kek: "Q'eqchi'", hif: "Fiji Hindi", crh: "Crimean Tatar", sah: "Sakha",
  ext: "Extremaduran", csb: "Kashubian", sgs: "Samogitian", cha: "Chamorro",
  tvl: "Tuvaluan", mi: "Maori", lin: "Lingala", arq: "Algerian Arabic",
  arz: "Egyptian Arabic", orv: "Old East Slavic", prg: "Old Prussian",
  chv: "Chuvash", bar: "Bavarian", pms: "Piedmontese", egl: "Emilian",
  jav: "Javanese", sun: "Sundanese", hoc: "Ho", zza: "Zaza",
  rif: "Riffian Berber", nog: "Nogai", km: "Khmer",
  // Codes that appear in the language tables but previously had no name and
  // were therefore labelled with the bare code.
  nds: "Low German",
  amh: "Amharic", // three-letter form of "am"
};
// Per-dataset row counts by language code for the raw corpus. Each entry is
// { key, name, langs: { code: rows } }. Codes are summed across datasets
// further down, so one language must use the same code everywhere: Amharic
// is "am" in every table (PolyglotText used to list it as "amh", which split
// it into two separate aggregate entries).
const DATASET_LANGS = [
  {
    key: "polytext", name: "PolyglotText",
    langs: {
      tr: 1767000, ru: 1695000, it: 1588000, en: 1337000, eo: 1171000,
      hu: 817000, de: 675000, fr: 520000, pt: 470000, mk: 398000,
      es: 358000, he: 272000, fi: 263000, ber: 180000, nl: 125000,
      pl: 118000, sr: 106000, mr: 96000, el: 94000, da: 90000,
      cs: 72000, sv: 71000, bg: 70000, la: 66000, zh: 58000, ro: 56000,
      ia: 54000, ja: 43000, tok: 39000, lfn: 38000, uk: 38000, tt: 33000,
      tl: 31000, id: 31000, nb: 31000, lt: 29000, az: 25000, ie: 24000,
      tlh: 23000, jbo: 21000, mhr: 19000, bn: 19000, fa: 17000, br: 17000,
      ilo: 17000, ar: 16000, ceb: 15000, hi: 13000, vi: 11000, pam: 11000,
      hy: 9000, be: 9000, ko: 9000,
      cbk: 19000, sk: 8000, vo: 8000, oc: 8000, et: 8000,
      war: 6700, ms: 6700, hr: 6700, eu: 6700, yi: 5400, af: 5400,
      km: 4000, ca: 4000, kha: 4000, dtp: 4000, zza: 4000, is: 4000,
      avk: 4000, ga: 4000, hoc: 4000, sl: 4000, sq: 4000, chv: 4000,
      kw: 4000, sux: 2700, ang: 2700, pms: 2700, prg: 2700, ug: 2700,
      lzh: 2700, egl: 2700, ur: 2700, sah: 2700, nds: 2700, mi: 2700,
      tvl: 1400, cha: 1400, th: 1400, cy: 1400, non: 1400, yo: 1400,
      lin: 1400, grc: 1400, arq: 1400, orv: 1400, sw: 1400, rif: 1400,
      crh: 1400, hif: 1400, jav: 1400, sun: 1400, hsb: 1400, dsb: 1400,
      am: 1400, csb: 1400, sgs: 1400, ext: 1400, nov: 1400, nog: 1400,
      arz: 1400, nah: 1400, ido: 1400, afh: 1400, kk: 1400,
    }
  },
  {
    key: "polyaudio", name: "PolyglotAudio",
    langs: {
      en: 698000, es: 261000, eo: 105000, de: 32000, fr: 16000,
      ru: 9200, pl: 8800, ber: 6600, nl: 5900, it: 5900,
      yue: 4300, pt: 3300, ja: 1400, mr: 1200, ca: 505,
      cs: 410, zh: 110, fi: 93, hu: 87, uk: 38,
      he: 16, tok: 5, kab: 5,
    }
  },
  {
    key: "tts", name: "multilingual-synthetic-tts",
    langs: {
      ja: 13951, ru: 9105, de: 8972, ko: 8129, es: 7917,
      pt: 5438, zh: 5417, en: 5157, fr: 4551,
    }
  },
  {
    key: "magazines", name: "magazines-multilingual-vqa",
    langs: {
      de: 4412, fr: 3279, ru: 2762, pt: 2047, vi: 1637, bn: 1598,
      en: 1004, af: 826, ar: 156, it: 136, fa: 132, te: 132,
      ja: 130, ne: 108, pa: 100, ur: 98, nl: 95, tr: 85, zh: 83,
      ta: 68, ml: 64, id: 43, th: 47, am: 123, yi: 108, sat: 85,
      so: 25, hi: 15, es: 10, mr: 9, kn: 8, or: 17, sd: 3,
      mai: 1021, la: 146, bo: 25, be: 8, da: 6, ko: 4, bg: 4,
      os: 3, sa: 3, my: 3, oc: 1, gd: 1, ti: 1, hy: 1, pl: 1,
      mni: 1, uk: 1, lo: 1, kk: 1,
    }
  },
  { key: "fma", name: "fma-labeled", langs: { en: 29000 } },
  { key: "streetview", name: "streetview-global", langs: { en: 30000 } },
  { key: "current_affairs", name: "current-affairs (raw, 2023-26)", langs: { en: 20694 } },
  { key: "frontend", name: "frontend-coding", langs: { en: 500 } },
  {
    key: "image_ann", name: "multilingual-image-annotations",
    langs: { en: 464, es: 464, fr: 464, hi: 464, zh: 464, ar: 464, pt: 464 }
  },
];
// Sum the per-dataset language counts into one total per language code.
const langTotals = {};
DATASET_LANGS.forEach(({ langs }) => {
  Object.keys(langs).forEach((code) => {
    langTotals[code] = (langTotals[code] || 0) + langs[code];
  });
});
// One record per language, largest total first. `sizeValue` is a log10-scaled
// copy of the total (offset by 10); `value` keeps the real row count.
const langEntries = Object.keys(langTotals)
  .map((code) => {
    const rowTotal = langTotals[code];
    return {
      code,
      name: LANG_NAMES[code] || code,
      value: rowTotal,
      sizeValue: Math.log10(rowTotal + 10),
    };
  })
  .sort((left, right) => right.value - left.value);
| // =========================================================================== | |
| // VORONOI TREEMAP (D3) — palette quantile coloring, black strokes | |
| // =========================================================================== | |
/**
 * Render the language Voronoi treemap inside `#chart-treemap`.
 *
 * Lays out `langEntries` (one record per language) as a circular Voronoi
 * treemap weighted by `sizeValue`, colors the cells by quantile onto
 * `PALETTE`, and wires up three interactions:
 *   - hover: a tooltip in `#voronoi-tooltip` with name, code and row count;
 *   - click: drill-down that enlarges the clicked cell, fades the others out,
 *     opens the language detail card and dispatches a `voronoi-drilldown`
 *     CustomEvent on the container (clicking the same cell again resets);
 *   - window resize: a debounced full redraw.
 *
 * Side effects: replaces every <svg> inside the container on each draw, sets
 * a `_centroid` expando on each cell <path>, and calls the file-level
 * `showLanguageDetails` / `hideLanguageDetails` helpers.
 */
(function renderVoronoi() {
  const container = document.getElementById('chart-treemap');
  const tooltip = document.getElementById('voronoi-tooltip');

  // Without a mount point there is nothing to draw into; bail out instead of
  // throwing a TypeError on the first container access.
  if (!container) return;

  if (typeof d3 === 'undefined' || !d3.voronoiTreemap) {
    container.insertAdjacentHTML('beforeend',
      `<div style="color:#ff8a8a;padding:24px;text-align:center;font-size:0.9rem">
        Voronoi treemap libraries failed to load. Check your network / CSP.
      </div>`);
    return;
  }

  // Map language size buckets onto PALETTE via quantiles, so cells naturally
  // cluster by tonal groups.
  const colorScale = d3.scaleQuantile()
    .domain(langEntries.map(e => e.sizeValue))
    .range(PALETTE);

  // Filter applied to the drilled-down cell.
  const SELECTED_FILTER =
    'drop-shadow(0 0 10px rgba(255,255,255,0.45)) brightness(1.35)';

  // Code of the drilled-down language, or null. Kept outside draw() so a
  // resize redraw can restore the highlight that matches the detail card,
  // which stays open across redraws.
  let selectedCode = null;

  /** Area of a polygon given as [x, y] pairs (shoelace formula). */
  function polyArea(pts) {
    let a = 0;
    for (let i = 0, n = pts.length; i < n; i++) {
      const [x1, y1] = pts[i];
      const [x2, y2] = pts[(i + 1) % n];
      a += x1 * y2 - x2 * y1;
    }
    return Math.abs(a) / 2;
  }

  /** Anchor point of a leaf cell: its `site` when present, else the polygon centroid. */
  function cellCenter(leaf) {
    return leaf.site || d3.polygonCentroid(leaf.polygon);
  }

  /** Tween one cell around its own centroid; `vars` overrides the resting defaults. */
  function tweenCell(node, vars) {
    const [ox, oy] = node._centroid;
    gsap.to(node, {
      svgOrigin: `${ox} ${oy}`,
      filter: 'none',
      duration: 0.35,
      ease: 'power2.out',
      overwrite: 'auto',
      ...vars,
    });
  }

  /**
   * Put the treemap into drill-down state: enlarge `activeNode`, fade out all
   * other cells, and keep only the label whose datum code is `activeCode`.
   * With `instant` the state is applied with zero-duration tweens (used when
   * restoring a selection after a redraw).
   */
  function applyDrilldown(cells, g, activeNode, activeCode, instant = false) {
    cells.nodes().forEach((node) => {
      if (node === activeNode) {
        tweenCell(node, {
          scale: 1.1, opacity: 1,
          filter: SELECTED_FILTER,
          duration: instant ? 0 : 0.45,
        });
      } else {
        tweenCell(node, { scale: 1, opacity: 0, duration: instant ? 0 : 0.35 });
      }
    });
    g.selectAll('.voronoi-label').each(function (ld) {
      const isActive = Boolean(ld) && ld.code === activeCode;
      const duration = isActive ? 0.45 : 0.35;
      gsap.to(this, {
        opacity: isActive ? 1 : 0,
        duration: instant ? 0 : duration,
        overwrite: 'auto',
      });
    });
  }

  /** Return every cell and label to its resting (unselected) appearance. */
  function clearDrilldown(cells, g) {
    cells.nodes().forEach(node => tweenCell(node, { scale: 1, opacity: 1 }));
    g.selectAll('.voronoi-label').each(function () {
      gsap.to(this, { opacity: 1, duration: 0.35, overwrite: 'auto' });
    });
  }

  /** Mosaic build: fade + scale up from each cell's own centroid, staggered. */
  function playEntryAnimation(cells) {
    cells.nodes().forEach((node, i) => {
      const [ox, oy] = node._centroid;
      gsap.fromTo(node,
        { scale: 0, opacity: 0, svgOrigin: `${ox} ${oy}` },
        { scale: 1, opacity: 1, svgOrigin: `${ox} ${oy}`,
          duration: 0.55, ease: 'power3.out', delay: i * 0.012 }
      );
    });
  }

  /**
   * Labels — sized by cell area; smaller cells show only the code, and the
   * tiniest get no label at all (their details are only on hover).
   */
  function drawLabels(g, leaves) {
    leaves.forEach((leaf) => {
      // Side of the square with the same area: a rough "how much room" metric.
      const side = Math.sqrt(polyArea(leaf.polygon));
      const [x, y] = cellCenter(leaf);
      const { name, code, value } = leaf.data;
      if (side >= 44) {
        const nameSize = Math.max(10, Math.min(18, side / 6));
        const codeSize = Math.max(9, Math.min(13, side / 9));
        const text = g.append('text')
          .datum(leaf.data)
          .attr('class', 'voronoi-label')
          .attr('x', x).attr('y', y - 2)
          .attr('font-size', nameSize);
        text.append('tspan').text(name);
        text.append('tspan')
          .attr('class', 'code')
          .attr('x', x).attr('dy', nameSize * 0.95)
          .attr('font-size', codeSize)
          .text(`${code} · ${formatShort(value)}`);
      } else if (side >= 22) {
        const sz = Math.max(8, Math.min(11, side / 3));
        g.append('text')
          .datum(leaf.data)
          .attr('class', 'voronoi-label')
          .attr('x', x).attr('y', y + sz / 3)
          .attr('font-size', sz)
          .text(code);
      }
    });
  }

  /** Hover tooltip: content on enter, follows the pointer, fades on leave. */
  function bindTooltip(cells) {
    // The tooltip element is optional; without it hover simply does nothing.
    if (!tooltip) return;

    const placeTooltip = (ev) => {
      const bb = container.getBoundingClientRect();
      tooltip.style.left = `${ev.clientX - bb.left + 12}px`;
      tooltip.style.top = `${ev.clientY - bb.top + 12}px`;
    };

    cells
      .on('mouseenter', (ev, d) => {
        tooltip.innerHTML =
          `<span class="t-name">${d.data.name}</span>` +
          `<span class="t-code">(${d.data.code})</span>` +
          `<div class="t-rows">${d.data.value.toLocaleString()} rows</div>`;
        // Position right away so the tooltip never fades in at a stale spot
        // while waiting for the first mousemove.
        placeTooltip(ev);
        gsap.to(tooltip, { opacity: 1, duration: 0.12, overwrite: true });
      })
      .on('mousemove', placeTooltip)
      .on('mouseleave', () => {
        gsap.to(tooltip, { opacity: 0, duration: 0.1, overwrite: true });
      });
  }

  /**
   * Click drill-down: highlight the clicked cell, fade out the rest, surface
   * a language detail card. Clicking the selected cell again resets.
   */
  function bindDrilldown(cells, g) {
    cells.on('click', function (ev, d) {
      const { code, name, value, sizeValue } = d.data;

      if (selectedCode === code) {
        selectedCode = null;
        clearDrilldown(cells, g);
        hideLanguageDetails();
        return;
      }

      selectedCode = code;
      applyDrilldown(cells, g, this, code);
      showLanguageDetails(d.data, colorScale(sizeValue));
      // Also emit the custom event so external code can react.
      container.dispatchEvent(new CustomEvent('voronoi-drilldown', {
        detail: { code, name, rows: value },
      }));
    });
  }

  /** (Re)build the whole SVG for the container's current size. */
  function draw() {
    container.querySelectorAll('svg').forEach(stale => stale.remove());

    const rect = container.getBoundingClientRect();
    const width = Math.max(320, rect.width);
    const height = Math.max(320, rect.height);

    // Circular clip polygon
    const clipPad = 6;
    const cx = width / 2;
    const cy = height / 2;
    const r = Math.min(width, height) / 2 - clipPad;
    const N = 96;
    const clipPolygon = d3.range(N).map(i => [
      cx + r * Math.cos((i / N) * 2 * Math.PI),
      cy + r * Math.sin((i / N) * 2 * Math.PI),
    ]);

    const root = d3.hierarchy({ name: 'root', children: langEntries })
      .sum(d => d.sizeValue);
    const treemap = d3.voronoiTreemap()
      .clip(clipPolygon)
      .convergenceRatio(0.005)
      .maxIterationCount(120)
      .minWeightRatio(0.01);
    treemap(root);

    const svg = d3.select(container).append('svg')
      .attr('viewBox', `0 0 ${width} ${height}`)
      .attr('preserveAspectRatio', 'xMidYMid meet');
    const g = svg.append('g');

    // Outer circle outline
    g.append('circle')
      .attr('cx', cx).attr('cy', cy).attr('r', r + 1)
      .attr('fill', 'none')
      .attr('stroke', '#1f1f1f')
      .attr('stroke-width', 1);

    const leaves = root.leaves();
    const cells = g.selectAll('path.voronoi-cell')
      .data(leaves)
      .join('path')
      .attr('class', 'voronoi-cell')
      .attr('d', d => 'M' + d.polygon.map(p => p.join(',')).join('L') + 'Z')
      .attr('fill', d => colorScale(d.data.sizeValue))
      .attr('stroke', '#000000')
      .attr('stroke-width', 1.2)
      .attr('stroke-linejoin', 'round');

    // Record centroid per cell so we can scale from the cell's own center.
    cells.each(function (leaf) {
      const [ox, oy] = cellCenter(leaf);
      this._centroid = [ox, oy];
    });

    drawLabels(g, leaves);
    bindTooltip(cells);
    bindDrilldown(cells, g);

    // A redraw rebuilds every cell. If a language was drilled into, its
    // detail card is still open, so re-apply the highlight to the new cell
    // instead of replaying the entry animation over a stale selection.
    const restoredCell = selectedCode === null
      ? null
      : cells.filter(d => d.data.code === selectedCode).node();
    if (restoredCell) {
      applyDrilldown(cells, g, restoredCell, selectedCode, true);
    } else {
      playEntryAnimation(cells);
    }
  }

  draw();

  // Debounced redraw: wait until resize events pause for 200 ms.
  let resizeTimer;
  window.addEventListener('resize', () => {
    clearTimeout(resizeTimer);
    resizeTimer = setTimeout(draw, 200);
  });
})();