Spaces:
No application file
No application file
| <!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8" /> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> | |
| <title>ReubenDataLab · Dataset Explorer</title> | |
| <!-- Open connections to both Google Fonts origins before the stylesheet below is requested. --> | |
| <link rel="preconnect" href="https://fonts.googleapis.com"> | |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> | |
| <link href="https://fonts.googleapis.com/css2?family=Geist:wght@100..900&family=Google+Sans:ital,opsz,wght@0,17..18,400..700;1,17..18,400..700&display=swap" rel="stylesheet"> | |
| <!-- Locally vendored libraries. `defer` keeps them from blocking the parser and runs them in the order listed here. --> | |
| <script src="vendor/d3.min.js" defer></script> | |
| <script src="vendor/d3-weighted-voronoi.js" defer></script> | |
| <script src="vendor/d3-voronoi-map.js" defer></script> | |
| <script src="vendor/d3-voronoi-treemap.js" defer></script> | |
| <script src="vendor/gsap.min.js" defer></script> | |
| <style> | |
| /* Theme tokens. Values are kept exactly as authored in case a script reads them back. */ | |
| :root { | |
| /* Surfaces, text and chrome */ | |
| --bg: #000000; | |
| --fg: #ffffff; | |
| --muted: #8a8a94; | |
| --card: #141414; | |
| --card-alt: #1c1c1e; | |
| --border: #262626; | |
| --divider: #2e2e2e; | |
| --tooltip-bg: rgba(20, 20, 20, 0.96); | |
| /* Twelve-entry categorical palette (not referenced by the rules visible in this stylesheet) */ | |
| --palette-1: #3b82f6; | |
| --palette-2: #10b981; | |
| --palette-3: #ef4444; | |
| --palette-4: #f59e0b; | |
| --palette-5: #8b5cf6; | |
| --palette-6: #ec4899; | |
| --palette-7: #06b6d4; | |
| --palette-8: #84cc16; | |
| --palette-9: #f97316; | |
| --palette-10: #14b8a6; | |
| --palette-11: #a855f7; | |
| --palette-12: #eab308; | |
| } | |
| /* Border-box sizing for every element. */ | |
| * { | |
| box-sizing: border-box; | |
| } | |
| /* Page canvas and default typography. */ | |
| html, | |
| body { | |
| min-height: 100vh; | |
| margin: 0; | |
| padding: 0; | |
| color: var(--fg); | |
| background: var(--bg); | |
| font-family: "Geist", "Google Sans", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; | |
| font-weight: 400; | |
| letter-spacing: 0.005em; | |
| -webkit-font-smoothing: antialiased; | |
| } | |
| /* Links take the text colour with no underline; hovering dims them. */ | |
| a { | |
| text-decoration: none; | |
| color: var(--fg); | |
| } | |
| a:hover { | |
| opacity: 0.7; | |
| } | |
| /* Page header: a centred column, capped at 1440px, holding the banner image. */ | |
| header { | |
| margin: 0 auto; | |
| max-width: 1440px; | |
| padding: 32px 24px 8px; | |
| text-align: center; | |
| } | |
| /* Banner scales down with the viewport and never exceeds 900px wide. */ | |
| .hero-img { | |
| display: block; | |
| margin: 0 auto; | |
| width: 100%; | |
| max-width: 900px; | |
| height: auto; | |
| border-radius: 14px; | |
| } | |
| /* Summary banner: five equal-width tiles in a single row. */ | |
| .hero-stats { | |
| display: grid; | |
| grid-template-columns: repeat(5, 1fr); | |
| gap: 14px; | |
| max-width: 1440px; | |
| margin: 24px auto 0; | |
| padding: 0 24px; | |
| } | |
| /* One tile. */ | |
| .stat { | |
| padding: 18px 14px; | |
| text-align: center; | |
| background: var(--card); | |
| border: 1px solid var(--border); | |
| border-radius: 16px; | |
| } | |
| /* Headline figure. */ | |
| .stat .num { | |
| display: block; | |
| color: var(--fg); | |
| font-size: 1.75rem; | |
| font-weight: 700; | |
| line-height: 1.05; | |
| letter-spacing: -0.015em; | |
| } | |
| /* Smaller, lighter fragment nested inside the figure. */ | |
| .stat .num .decimal { | |
| margin-left: 1px; | |
| font-size: 0.55em; | |
| font-weight: 500; | |
| opacity: 0.75; | |
| } | |
| /* Upper-case caption under the figure. */ | |
| .stat .lbl { | |
| display: block; | |
| margin-top: 8px; | |
| color: var(--muted); | |
| font-size: 0.68rem; | |
| font-weight: 500; | |
| letter-spacing: 0.13em; | |
| text-transform: uppercase; | |
| } | |
| /* Secondary note; keeps the casing it was written in. */ | |
| .stat .sub { | |
| display: block; | |
| margin-top: 4px; | |
| color: var(--muted); | |
| font-size: 0.6rem; | |
| font-weight: 400; | |
| letter-spacing: 0.04em; | |
| text-transform: none; | |
| opacity: 0.65; | |
| } | |
| /* Chart row: two equal columns of cards. */ | |
| .charts { | |
| display: grid; | |
| grid-template-columns: repeat(2, 1fr); | |
| gap: 24px; | |
| max-width: 1440px; | |
| margin: 0 auto; | |
| padding: 24px; | |
| } | |
| /* Card shell shared by every chart. */ | |
| .chart-card { | |
| padding: 24px 20px 16px; | |
| background: var(--card); | |
| border: 1px solid var(--border); | |
| border-radius: 20px; | |
| } | |
| /* Card title. */ | |
| .chart-card h2 { | |
| margin: 0 0 4px; | |
| color: var(--fg); | |
| font-size: 1.1rem; | |
| font-weight: 600; | |
| letter-spacing: -0.005em; | |
| text-align: center; | |
| } | |
| /* One-line description under the title. */ | |
| .chart-card .subtitle { | |
| margin: 0 0 14px; | |
| color: var(--muted); | |
| font-size: 0.82rem; | |
| font-weight: 400; | |
| text-align: center; | |
| } | |
| /* Donut chart: a square, centred box that anchors the absolutely positioned overlay. */ | |
| .donut-wrap { | |
| position: relative; | |
| margin: 0 auto; | |
| width: 100%; | |
| max-width: 560px; | |
| aspect-ratio: 1 / 1; | |
| } | |
| .donut-wrap.small { | |
| max-width: 400px; | |
| } | |
| /* The SVG fills the box; visible overflow keeps hover glows from being clipped at the edge. */ | |
| .donut-svg { | |
| display: block; | |
| width: 100%; | |
| height: 100%; | |
| overflow: visible; | |
| } | |
| /* Slices brighten and gain a soft glow on hover. */ | |
| .donut-slice { | |
| cursor: pointer; | |
| transition: filter 0.2s ease; | |
| } | |
| .donut-slice:hover { | |
| filter: brightness(1.25) drop-shadow(0 0 10px rgba(255,255,255,0.15)); | |
| } | |
| /* Overlay covering the whole chart; pointer events pass through to the SVG underneath. */ | |
| .donut-center { | |
| position: absolute; | |
| inset: 0; | |
| display: flex; | |
| flex-direction: column; | |
| justify-content: center; | |
| align-items: center; | |
| padding: 18%; | |
| text-align: center; | |
| pointer-events: none; | |
| } | |
| .donut-center.small { | |
| padding: 22%; | |
| } | |
| .center-item { | |
| width: 100%; | |
| } | |
| /* Small upper-case caption, optionally with an icon beside it. */ | |
| .center-label { | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| gap: 6px; | |
| color: var(--muted); | |
| font-size: 0.65rem; | |
| font-weight: 500; | |
| letter-spacing: 0.18em; | |
| text-transform: uppercase; | |
| } | |
| .center-label .icon { | |
| font-size: 0.85rem; | |
| opacity: 0.9; | |
| } | |
| /* Large figure whose size tracks the viewport width between 1.8rem and 2.75rem. */ | |
| .center-number { | |
| margin: 4px 0; | |
| color: var(--fg); | |
| font-size: clamp(1.8rem, 4.5vw, 2.75rem); | |
| font-weight: 700; | |
| line-height: 1; | |
| letter-spacing: -0.03em; | |
| } | |
| .center-number .decimal { | |
| margin-left: 1px; | |
| color: var(--fg); | |
| font-size: 0.55em; | |
| font-weight: 500; | |
| opacity: 0.72; | |
| } | |
| /* Hairline separator: all borders are cleared first, then only the top edge is drawn. */ | |
| .center-divider { | |
| margin: 10px auto; | |
| width: 42%; | |
| border: none; | |
| border-top: 1px solid rgba(255, 255, 255, 0.08); | |
| } | |
| /* Details panel. NOTE(review): the inner classes (.swatch, .tagline, .kv*, .schema-list) do not occur in the static markup — presumably emitted by the page script; confirm. */ | |
| .details { | |
| max-width: 1440px; | |
| margin: 0 auto 32px; | |
| padding: 0 24px; | |
| } | |
| .details-card { | |
| min-height: 140px; | |
| padding: 26px 28px; | |
| background: var(--card); | |
| border: 1px solid var(--border); | |
| border-radius: 20px; | |
| } | |
| /* Heading row: colour swatch, name and link laid out on one line. */ | |
| .details-card h3 { | |
| display: flex; | |
| align-items: center; | |
| gap: 12px; | |
| margin: 0 0 8px; | |
| color: var(--fg); | |
| font-size: 1.35rem; | |
| font-weight: 600; | |
| letter-spacing: -0.01em; | |
| } | |
| .details-card h3 .swatch { | |
| display: inline-block; | |
| width: 14px; | |
| height: 14px; | |
| border-radius: 50%; | |
| } | |
| .details-card h3 a { | |
| color: var(--fg); | |
| font-size: 1.05rem; | |
| opacity: 0.85; | |
| } | |
| .details-card h3 a:hover { | |
| text-decoration: underline; | |
| opacity: 1; | |
| } | |
| .details-card .tagline { | |
| margin: 0 0 18px; | |
| color: var(--muted); | |
| font-size: 0.95rem; | |
| } | |
| /* Key/value grid: as many columns of at least 220px as fit. */ | |
| .kv-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); | |
| gap: 12px 24px; | |
| } | |
| .kv .k { | |
| margin-bottom: 4px; | |
| color: var(--muted); | |
| font-size: 0.75rem; | |
| font-weight: 500; | |
| letter-spacing: 0.1em; | |
| text-transform: uppercase; | |
| } | |
| .kv .v { | |
| color: var(--fg); | |
| font-size: 0.9rem; | |
| } | |
| .kv .v a { | |
| border-bottom: 1px dashed var(--muted); | |
| } | |
| .kv .v strong { | |
| font-weight: 600; | |
| } | |
| /* Wrapping row of monospace chips. */ | |
| .schema-list { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 6px; | |
| margin-top: 6px; | |
| } | |
| .schema-list code { | |
| padding: 3px 8px; | |
| color: var(--fg); | |
| background: var(--card-alt); | |
| border: 1px solid var(--border); | |
| border-radius: 6px; | |
| font-family: "SF Mono", Consolas, monospace; | |
| font-size: 0.78rem; | |
| } | |
| /* Second row: narrow modality card beside a treemap card twice as wide. */ | |
| .extras { | |
| display: grid; | |
| grid-template-columns: 1fr 2fr; | |
| gap: 24px; | |
| max-width: 1440px; | |
| margin: 8px auto 0; | |
| padding: 0 24px 24px; | |
| } | |
| /* Fixed-height treemap canvas; position: relative makes it the containing block for absolutely positioned children. */ | |
| .plot-treemap { | |
| position: relative; | |
| width: 100%; | |
| height: 900px; | |
| } | |
| .plot-treemap svg { | |
| display: block; | |
| width: 100%; | |
| height: 100%; | |
| } | |
| /* Voronoi treemap: cells, their labels and the hover tooltip. */ | |
| .voronoi-cell { | |
| cursor: pointer; | |
| transition: filter 0.18s ease, opacity 0.18s ease; | |
| } | |
| .voronoi-cell:hover { filter: brightness(1.35) drop-shadow(0 0 8px rgba(255,255,255,0.35)); } | |
| /* Labels are decoration on top of the cells: they ignore the pointer and cannot be text-selected. */ | |
| .voronoi-label { | |
| font-family: "Geist", "Google Sans", sans-serif; | |
| font-weight: 600; | |
| fill: #ffffff; | |
| pointer-events: none; | |
| text-anchor: middle; | |
| /* Prefixed form first, for Safari/WebKit builds that only honour -webkit-user-select. */ | |
| -webkit-user-select: none; | |
| user-select: none; | |
| } | |
| .voronoi-label .code { font-weight: 400; opacity: 0.8; fill: #ffffff; } | |
| /* Tooltip is invisible by default (opacity: 0) and absolutely positioned within its nearest positioned ancestor. */ | |
| .voronoi-tooltip { | |
| position: absolute; | |
| pointer-events: none; | |
| background: var(--tooltip-bg); | |
| border: 1px solid var(--border); | |
| border-radius: 10px; | |
| padding: 10px 14px; | |
| font-size: 0.85rem; | |
| color: var(--fg); | |
| box-shadow: 0 12px 32px rgba(0,0,0,0.7); | |
| opacity: 0; | |
| transition: opacity 0.12s ease; | |
| white-space: nowrap; | |
| z-index: 20; | |
| font-family: "Geist", sans-serif; | |
| } | |
| .voronoi-tooltip .t-name { font-weight: 700; color: var(--fg); font-size: 0.95rem; } | |
| .voronoi-tooltip .t-code { color: var(--muted); font-size: 0.72rem; margin-left: 4px; } | |
| .voronoi-tooltip .t-rows { color: var(--fg); font-weight: 600; margin-top: 4px; opacity: 0.9; } | |
| /* Donut tooltip: same look as the voronoi tooltip, but fixed to the viewport and stacked higher. */ | |
| .donut-tooltip { | |
| position: fixed; | |
| z-index: 50; | |
| padding: 10px 14px; | |
| color: var(--fg); | |
| background: var(--tooltip-bg); | |
| border: 1px solid var(--border); | |
| border-radius: 10px; | |
| box-shadow: 0 12px 32px rgba(0,0,0,0.7); | |
| font-family: "Geist", sans-serif; | |
| font-size: 0.85rem; | |
| white-space: nowrap; | |
| pointer-events: none; | |
| opacity: 0; | |
| transition: opacity 0.12s ease; | |
| } | |
| .donut-tooltip .t-name { | |
| font-size: 0.95rem; | |
| font-weight: 700; | |
| } | |
| .donut-tooltip .t-meta { | |
| margin-top: 4px; | |
| color: var(--muted); | |
| font-size: 0.78rem; | |
| } | |
| /* Footer: centred, muted small print with dashed-underlined links. */ | |
| footer { | |
| max-width: 1440px; | |
| margin: 0 auto 32px; | |
| padding: 0 24px; | |
| color: var(--muted); | |
| font-size: 0.8rem; | |
| font-weight: 400; | |
| text-align: center; | |
| } | |
| footer a { | |
| border-bottom: 1px dashed var(--muted); | |
| } | |
| /* Up to 900px: stat tiles drop to two columns and the extras row stacks. */ | |
| @media (max-width: 900px) { | |
| .hero-stats { | |
| grid-template-columns: repeat(2, 1fr); | |
| } | |
| .extras { | |
| grid-template-columns: 1fr; | |
| } | |
| } | |
| /* Up to 780px: the two main charts stack as well. */ | |
| @media (max-width: 780px) { | |
| .charts { | |
| grid-template-columns: 1fr; | |
| } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <header> | |
| <!-- The banner is the page title, so it sits in the document's only <h1>; its alt text becomes the heading text above the <h2> chart titles. The inline margin reset removes the default heading margins so the banner spacing stays as before. --> | |
| <h1 style="margin: 0;"><img src="Reubensdataset.png" alt="Reuben's Data Lab" class="hero-img" /></h1> | |
| </header> | |
| <!-- Summary tiles. Each .num span ships empty and carries its figure in data-value, so no number is shown unless a script fills it in. NOTE(review): presumably app.js renders the figure from data-value — confirm. --> | |
| <section class="hero-stats"> | |
| <div class="stat"> | |
| <span class="num" data-value="12"></span> | |
| <span class="lbl">Raw datasets</span> | |
| <span class="sub">in four HF collections</span> | |
| </div> | |
| <div class="stat"> | |
| <span class="num" data-value="14.8M"></span> | |
| <span class="lbl">Total rows</span> | |
| <span class="sub">every row, every dataset</span> | |
| </div> | |
| <div class="stat"> | |
| <span class="num" data-value="130+"></span> | |
| <span class="lbl">Languages</span> | |
| <span class="sub">many rarely seen online</span> | |
| </div> | |
| <div class="stat"> | |
| <span class="num" data-value="4"></span> | |
| <span class="lbl">Modalities</span> | |
| <span class="sub">audio, text, images, code</span> | |
| </div> | |
| <div class="stat"> | |
| <span class="num" data-value="17"></span> | |
| <span class="lbl">Days to build</span> | |
| <span class="sub">April 8 to April 24, 2026</span> | |
| </div> | |
| </section> | |
| <!-- Two donut cards side by side. Each ships an empty <svg> plus an empty .donut-center overlay, which the stylesheet positions absolutely over the chart. NOTE(review): presumably app.js draws into #chart-* and #center-* — confirm. --> | |
| <section class="charts"> | |
| <div class="chart-card"> | |
| <h2>Raw corpus</h2> | |
| <div class="subtitle">Every dataset I've created in the <a href="https://huggingface.co/ReubenDataLab/collections" target="_blank" rel="noopener">ReubenDataLab collections</a></div> | |
| <div class="donut-wrap"> | |
| <svg id="chart-raw" class="donut-svg"></svg> | |
| <div class="donut-center" id="center-raw"></div> | |
| </div> | |
| </div> | |
| <div class="chart-card"> | |
| <h2>Adaption-remastered</h2> | |
| <div class="subtitle">Improved datasets after running them through <a href="https://adaptionlabs.ai" target="_blank" rel="noopener">adaptionlabs.ai</a></div> | |
| <div class="donut-wrap"> | |
| <svg id="chart-adaption" class="donut-svg"></svg> | |
| <div class="donut-center" id="center-adaption"></div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Details panel: empty and hidden by an inline display:none at load. NOTE(review): presumably revealed and filled by app.js when a chart item is selected — confirm. --> | |
| <div class="details"> | |
| <div id="details-card" class="details-card" style="display: none;"></div> | |
| </div> | |
| <!-- Second row: a small modality donut beside the language treemap container. The tooltip <div> lives inside #chart-treemap, which the stylesheet makes position: relative, so the absolutely positioned tooltip is placed relative to the treemap. --> | |
| <section class="extras"> | |
| <div class="chart-card"> | |
| <h2>Modality split</h2> | |
| <div class="subtitle">Share of the corpus by data type</div> | |
| <div class="donut-wrap small"> | |
| <svg id="chart-modality" class="donut-svg"></svg> | |
| <div class="donut-center small" id="center-modality"></div> | |
| </div> | |
| </div> | |
| <div class="chart-card"> | |
| <h2>Languages across the corpus</h2> | |
| <div class="subtitle">Every language that appears in any raw dataset, sized (log-scale) by total row count. Hover for exact numbers.</div> | |
| <div id="chart-treemap" class="plot-treemap"> | |
| <div id="voronoi-tooltip" class="voronoi-tooltip"></div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Single page-level tooltip element; .donut-tooltip styles it position: fixed and opacity: 0. NOTE(review): presumably moved and shown by app.js on donut hover — confirm. --> | |
| <div id="donut-tooltip" class="donut-tooltip"></div> | |
| <!-- Credits. Segments are separated by a middle dot; both links open in a new tab and carry rel="noopener", like the other external links on the page. --> | |
| <footer> | |
| Data self-reported from HF dataset pages · Built for the | |
| <a href="https://www.adaptionlabs.ai/blog/the-uncharted-data-challenge" target="_blank" rel="noopener">Uncharted Data Challenge</a> | |
| · Author <a href="https://huggingface.co/Reubencf" target="_blank" rel="noopener">@Reubencf</a> | |
| </footer> | |
| <!-- Page script. Being deferred, it runs after parsing and after the deferred vendor scripts declared in <head>, because deferred scripts execute in document order. --> | |
| <script src="app.js" defer></script> | |
| </body> | |
| </html> | |