finephrase

Running on CPU Upgrade

App Files Files Community

joelniklaus HF Staff committed 2 days ago

Commit

20b148e

1 Parent(s): 9142722

extend math analysis

Browse files

Files changed (7) hide show

app/src/content/analysis/qwen3_vs_smollm2_math.py +445 -79
app/src/content/analysis/qwen3_vs_smollm2_math_results.json +2 -2
app/src/content/assets/data/math_format_adherence.csv +3 -0
app/src/content/assets/data/qwen3_vs_smollm2_prefix_collapse.csv +2 -2
app/src/content/chapters/4-analyses.mdx +48 -15
app/src/content/embeds/d3-prefix-collapse.html +91 -57
app/src/content/embeds/math-adherence-audit.html +346 -0

app/src/content/analysis/qwen3_vs_smollm2_math.py CHANGED Viewed

@@ -15,15 +15,28 @@ from collections import Counter
 from pathlib import Path
 import pandas as pd
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
 BUCKET_URI = "hf://buckets/HuggingFaceFW/finephrase-rephrased/format"
-CHUNK = "00000_chunk_0.jsonl.gz"
 CACHE_DIR = Path(__file__).parent / ".cache_math_outputs"
-MODELS = {"SmolLM2": "math-smollm2-1.7b-hq", "Qwen3": "math-qwen3-1.7b-hq"}
 # Headers like "Problem:" and "**Problem:**", optionally bolded
 PROBLEM_RE = re.compile(r"\*{0,2}\s*problem\s*\*{0,2}\s*:", re.IGNORECASE)
@@ -35,48 +48,207 @@ STEP_RE = re.compile(r"^\s*(?:\d+[\.\)]|Step \d+:|[-*])\s", re.MULTILINE)
 LATEX_RE = re.compile(
     r"\$\$[^$]+\$\$|\$[^$\n]+\$|\\frac|\\times|\\div|\\cdot|\\sqrt|\\sum|\\int|\\pi\b|\\alpha\b|\\beta\b"
 )
 # Prefix lengths (in characters) at which to measure template collapse.
 START_LENS = list(range(2, 61, 2))
-def fetch_chunk(model_dir: str) -> Path:
-    """Download chunk 0 for `model_dir` from the bucket into the local cache."""
-    CACHE_DIR.mkdir(exist_ok=True)
-    dst = CACHE_DIR / f"{model_dir}.jsonl.gz"
     if dst.exists():
         return dst
-    src = f"{BUCKET_URI}/{model_dir}/{CHUNK}"
-    logger.info(f"Downloading {src} -> {dst}")
-    subprocess.run(["hf", "buckets", "cp", src, str(dst)], check=True)
     return dst
-def load_outputs(path: Path) -> pd.DataFrame:
-    """Load rephrased outputs and per-row structural features."""
     rows = []
-    with gzip.open(path, "rt") as f:
-        for line in f:
-            d = json.loads(line)
-            # Some SmolLM2 rows had vLLM context-length errors and lack `text`
-            text = d["text"] if "text" in d else ""
-            tokens = d["metadata"]["token_count"]
-            ir = d["metadata"]["inference_results"][0]
-            finish_reason = ir["finish_reason"] if "finish_reason" in ir else "error"
-            has_problem = bool(PROBLEM_RE.search(text))
-            has_solution = bool(SOLUTION_RE.search(text))
-            row = {
-                "text": text,
-                "tokens": tokens,
-                "finish_reason": finish_reason,
-                "has_problem": has_problem,
-                "has_solution": has_solution,
-                "has_problem_solution": has_problem and has_solution,
-                "has_step_by_step": len(STEP_RE.findall(text)) >= 2,
-                "has_latex": bool(LATEX_RE.search(text)),
-            }
-            for n in START_LENS:
-                row[f"start_{n}"] = text[:n]
-            rows.append(row)
     return pd.DataFrame(rows)
@@ -93,10 +265,36 @@ def quality_bucket(row: pd.Series) -> str:
     return "Poor"
 def summarise(name: str, df: pd.DataFrame) -> dict:
     """Compute the headline numbers for one model."""
     n = len(df)
     buckets = df.apply(quality_bucket, axis=1).value_counts().to_dict()
     # Exclude failed inferences (empty text) from prefix-collapse to avoid
     # an artificial "" cluster.
     non_empty = df[df["text"].str.len() > 0]
@@ -128,57 +326,105 @@ def summarise(name: str, df: pd.DataFrame) -> dict:
         "n_errored": int((df["finish_reason"] == "error").sum()),
         "prefix_collapse": prefix_collapse,
         "buckets": {k: 100 * buckets.get(k, 0) / n for k in ("Excellent", "Good", "Partial", "Poor")},
     }
 def main() -> None:
     summaries = {}
-    for name, model_dir in MODELS.items():
-        path = fetch_chunk(model_dir)
-        df = load_outputs(path)
         summaries[name] = summarise(name, df)
     logger.info("\n=== Structural quality (per 1000 outputs) ===")
-    logger.info(f"{'Metric':<32} {'SmolLM2':>10} {'Qwen3':>10}")
-    logger.info("-" * 54)
-    for key, label in [
-        ("pct_problem_solution", "Problem + Solution sections"),
-        ("pct_step_by_step", "Numbered step-by-step"),
-        ("pct_latex", "LaTeX math notation"),
-        ("pct_has_solution", "Contains the word 'solution'"),
-    ]:
-        logger.info(
-            f"{label:<32} {summaries['SmolLM2'][key]:>9.1f}% {summaries['Qwen3'][key]:>9.1f}%"
-        )
     logger.info("\n=== Output length (tokens, output_tokenizer) ===")
-    logger.info(f"{'Statistic':<32} {'SmolLM2':>10} {'Qwen3':>10}")
-    logger.info("-" * 54)
-    for key, label in [
-        ("token_min", "min (all)"),
-        ("token_median", "median (all)"),
-        ("token_max", "max (all)"),
-        ("stop_token_min", "min (finish=stop)"),
-        ("stop_token_max", "max (finish=stop)"),
-        ("n_truncated", "# length-truncated"),
-        ("n_errored", "# inference errors"),
-    ]:
-        logger.info(f"{label:<32} {summaries['SmolLM2'][key]:>10} {summaries['Qwen3'][key]:>10}")
     logger.info("\n=== Template collapse (most common prefix at varying lengths) ===")
-    logger.info(f"{'Prefix chars':<14} {'SmolLM2 (top/distinct)':>26} {'Qwen3 (top/distinct)':>26}")
-    logger.info("-" * 70)
     for size in [10, 20, 40, 60]:
-        s = summaries["SmolLM2"]["prefix_collapse"][size]
-        q = summaries["Qwen3"]["prefix_collapse"][size]
-        logger.info(
-            f"{size:<14} "
-            f"{s['most_common_count']:>10}/1000 ({s['distinct']:>4} unique) "
-            f"{q['most_common_count']:>10}/1000 ({q['distinct']:>4} unique)"
-        )
     logger.info("\nMost common 40-char prefix per model:")
-    for name in ("SmolLM2", "Qwen3"):
-        logger.info(f"  {name}: {summaries[name]['prefix_collapse'][40]['example']!r}")
     # Emit the dense prefix-collapse curve as CSV for the d3 chart.
     csv_path = Path(__file__).parent.parent / "assets/data/qwen3_vs_smollm2_prefix_collapse.csv"
@@ -196,16 +442,136 @@ def main() -> None:
     pd.DataFrame(csv_rows).to_csv(csv_path, index=False)
     logger.info(f"\nSaved prefix-collapse curve to {csv_path}")
-    logger.info("\n=== Quality buckets (% of outputs) ===")
-    logger.info(f"{'Bucket':<12} {'SmolLM2':>10} {'Qwen3':>10}")
-    logger.info("-" * 34)
-    for bucket in ("Excellent", "Good", "Partial", "Poor"):
-        logger.info(
-            f"{bucket:<12} "
-            f"{summaries['SmolLM2']['buckets'][bucket]:>9.1f}% "
-            f"{summaries['Qwen3']['buckets'][bucket]:>9.1f}%"
         )
     out = Path(__file__).parent / "qwen3_vs_smollm2_math_results.json"
     with out.open("w") as f:
         json.dump(summaries, f, indent=2)

 from pathlib import Path
 import pandas as pd
+from joblib import Parallel, delayed
+from tqdm.auto import tqdm
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
 BUCKET_URI = "hf://buckets/HuggingFaceFW/finephrase-rephrased/format"
+# Each chunk holds 1000 documents; 10 chunks = 10000 outputs per model.
+N_CHUNKS = 10
+CHUNKS = [f"00000_chunk_{i}.jsonl.gz" for i in range(N_CHUNKS)]
 CACHE_DIR = Path(__file__).parent / ".cache_math_outputs"
+MODELS = {
+    "SmolLM2": "math-smollm2-1.7b-hq",
+    "Qwen3": "math-qwen3-1.7b-hq",
+    "Llama 3.2": "math-llama3.2-1b-hq",
+    "Gemma 3": "math-1b-hq",
+    "Falcon 3": "math-falcon3-1b-hq",
+    "Granite 3": "math-granite3-1b-hq",
+}
+# Default-visible models in the interactive comparison.
+DEFAULT_VISIBLE = ["SmolLM2", "Qwen3", "Llama 3.2"]
 # Headers like "Problem:" and "**Problem:**", optionally bolded
 PROBLEM_RE = re.compile(r"\*{0,2}\s*problem\s*\*{0,2}\s*:", re.IGNORECASE)
 LATEX_RE = re.compile(
     r"\$\$[^$]+\$\$|\$[^$\n]+\$|\\frac|\\times|\\div|\\cdot|\\sqrt|\\sum|\\int|\\pi\b|\\alpha\b|\\beta\b"
 )
+# Format-adherence patterns for the math audit (Section 5.2 reviewer response).
+DIGIT_RE = re.compile(r"\d")
+# Any arithmetic op directly between two numbers, or common LaTeX math ops.
+ARITH_OP_RE = re.compile(r"\d\s*[+\-×÷*/]\s*\d|\\times|\\div|\\cdot|\\frac")
+# A full "N op N = N" arithmetic statement; allows commas (e.g. "1,500") and decimals.
# One numeric operand. The comma-grouped form must contain at least one
# ",ddd" group; otherwise a plain digit run is consumed greedily. The trailing
# `(?!\d)` stops a match from ending in the middle of a digit run, so a result
# such as "1500" is captured whole instead of being truncated to "150".
_NUM = r"-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?!\d)"
ARITH_EQ_RE = re.compile(rf"({_NUM})\s*([+\-×÷*/])\s*({_NUM})\s*=\s*({_NUM})")
+# Standalone numeric tokens for the number-provenance check.
+NUM_TOKEN_RE = re.compile(r"\d+(?:[.,]\d+)*")
+# Interrogative / imperative cues that mark a real math question.
+QUESTION_CUE_RE = re.compile(
+    r"\b(how many|how much|how long|how often|what is|what was|what are|"
+    r"calculate|compute|determine|find|solve)\b",
+    re.IGNORECASE,
+)
+# Patterns at the end of a solution that indicate a closed numeric answer.
+FINAL_ANSWER_RE = re.compile(
+    r"(the (final )?answer is\s*\$?\d|therefore[, ]+[^.\n]{0,40}=\s*-?\d|"
+    r"=\s*-?\d+(?:\.\d+)?\s*[.\n]|answer\s*[:=]\s*-?\d|total\s*[:=]\s*-?\d)",
+    re.IGNORECASE,
+)
+# Lexicon used for math-vocabulary density (per 100 alpha tokens).
+MATH_WORDS = frozenset({
+    "calculate", "calculation", "equation", "equations", "probability", "percentage",
+    "percent", "ratio", "sum", "product", "average", "mean", "median",
+    "divided", "multiplied", "equals", "subtract", "subtraction", "add", "addition",
+    "multiply", "multiplication", "divide", "division", "formula", "variable",
+    "solve", "value", "integer", "decimal", "fraction", "remainder", "quotient",
+})
+# English stopwords for the topic-drift Jaccard check.
+STOPWORDS = frozenset((
+    "the a an and or but if then so as is are was were be been being have has "
+    "had do does did of to in on at by for with from up about into through "
+    "during before after above below between under further once here there "
+    "when where why how all any both each few more most other some such no nor "
+    "not only own same so than too very can will just don should now this that "
+    "these those it its their them they we you our your his her him she he"
+).split())
 # Prefix lengths (in characters) at which to measure template collapse.
 START_LENS = list(range(2, 61, 2))
def _download_one(model_dir: str, chunk_name: str) -> Path:
    """Download a single chunk if not already cached.

    The chunk is first copied to a sibling ``<chunk_name>.part`` file and only
    renamed to its final name once the copy command has succeeded, so an
    interrupted download is never mistaken for a cached chunk on the next run.

    Args:
        model_dir: Model sub-directory, both in the bucket and under CACHE_DIR.
        chunk_name: File name of the chunk inside that directory.

    Returns:
        The destination path of the cached chunk.

    Raises:
        subprocess.CalledProcessError: If the `hf buckets cp` command fails.
    """
    dst_dir = CACHE_DIR / model_dir
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst = dst_dir / chunk_name
    if dst.exists():
        return dst
    src = f"{BUCKET_URI}/{model_dir}/{chunk_name}"
    partial = dst.with_name(f"{dst.name}.part")
    try:
        subprocess.run(
            ["hf", "buckets", "cp", src, str(partial)],
            check=True,
            capture_output=True,
        )
        # Publish under the final name only after a complete copy.
        partial.replace(dst)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial.unlink(missing_ok=True)
    return dst
def fetch_chunks(model_dir: str, chunks: list[str] = CHUNKS) -> list[Path]:
    """Ensure every chunk in `chunks` is cached locally for `model_dir`.

    Chunks already present in the cache are left alone; the rest are fetched
    with up to four worker threads. Returns the cache paths of all requested
    chunks, in the order given.
    """
    model_cache = CACHE_DIR / model_dir
    targets = [model_cache / chunk for chunk in chunks]
    to_fetch = [chunk for chunk, target in zip(chunks, targets) if not target.exists()]
    if not to_fetch:
        return targets

    logger.info(f"  Downloading {len(to_fetch)} chunk(s) for {model_dir}...")
    progress = tqdm(to_fetch, desc=f"  {model_dir}", leave=False)
    jobs = (delayed(_download_one)(model_dir, chunk) for chunk in progress)
    Parallel(n_jobs=4, backend="threading")(jobs)
    return targets
+def _verify_equation(a_s: str, op: str, b_s: str, r_s: str) -> bool:
+    """Return True iff `a op b == r` (with small tolerance, ignoring thousands commas)."""
+    try:
+        a = float(a_s.replace(",", ""))
+        b = float(b_s.replace(",", ""))
+        r = float(r_s.replace(",", ""))
+    except ValueError:
+        return False
+    if op == "+":
+        expected = a + b
+    elif op == "-":
+        expected = a - b
+    elif op in ("×", "*"):
+        expected = a * b
+    elif op in ("÷", "/"):
+        if b == 0:
+            return False
+        expected = a / b
+    else:
+        return False
+    return abs(expected - r) < max(0.01, abs(expected) * 1e-4)
def _content_words(text: str) -> set[str]:
    """Return the distinct lowercase alphabetic runs of 4+ letters in `text`, minus stopwords."""
    candidates = set(re.findall(r"[a-z]{4,}", text.lower()))
    return candidates - STOPWORDS
def _normalize_numbers(text: str) -> set[str]:
    """Collect the distinct numeric tokens of `text`, with commas removed from each token."""
    normalized = set()
    for match in NUM_TOKEN_RE.finditer(text):
        normalized.add(match.group(0).replace(",", ""))
    return normalized
def audit_math_validity(text: str, input_text: str) -> dict:
    """Compute row-level math format-adherence signals for one output.

    Compares the rephrased `text` with its source `input_text` and returns a
    flat dict of features:

      * structural: presence of digits and arithmetic operators, the number of
        "N op N = N" statements found, and how many of those verify;
      * number provenance: how many distinct numeric tokens of the output also
        occur in the source document;
      * question/answer alignment: a question cue somewhere in the output
        together with a digit in its last 200 characters;
      * final-answer pattern: whether FINAL_ANSWER_RE matches the output;
      * math vocabulary density: MATH_WORDS hits per 100 alphabetic tokens;
      * topic Jaccard: content-word overlap between source and output.
    """
    # Arithmetic statements and how many of them check out.
    equations = ARITH_EQ_RE.findall(text)
    verified = sum(_verify_equation(*parts) for parts in equations)

    # Numbers that the output shares with the source document.
    output_numbers = _normalize_numbers(text)
    reused_numbers = output_numbers & _normalize_numbers(input_text)

    # Content-word overlap; defined as 0.0 when both sides have no content words.
    source_words = _content_words(input_text)
    output_words = _content_words(text)
    all_words = source_words | output_words
    if all_words:
        jaccard = len(source_words & output_words) / len(all_words)
    else:
        jaccard = 0.0

    # Share of alphabetic tokens that belong to the math lexicon.
    alpha_tokens = re.findall(r"[a-zA-Z]+", text.lower())
    if alpha_tokens:
        math_hits = sum(1 for token in alpha_tokens if token in MATH_WORDS)
        math_vocab_density = 100.0 * math_hits / len(alpha_tokens)
    else:
        math_vocab_density = 0.0

    asks_question = bool(QUESTION_CUE_RE.search(text))
    digit_in_tail = bool(DIGIT_RE.search(text[-200:]))

    features = {}
    # Structural
    features["has_digits"] = bool(DIGIT_RE.search(text))
    features["has_arith_op"] = bool(ARITH_OP_RE.search(text))
    features["n_equations"] = len(equations)
    features["n_correct_equations"] = verified
    features["has_correct_equation"] = verified > 0
    # Audit #1: number provenance
    features["n_out_numbers"] = len(output_numbers)
    features["n_shared_numbers"] = len(reused_numbers)
    features["any_number_from_input"] = len(reused_numbers) > 0
    # Audit #2: question/answer alignment
    features["has_question_cue"] = asks_question
    features["has_numeric_in_tail"] = digit_in_tail
    features["qa_aligned"] = asks_question and digit_in_tail
    # Audit #3: final-answer pattern
    features["has_final_answer"] = bool(FINAL_ANSWER_RE.search(text))
    # Audit #4: math vocabulary density (per 100 alpha tokens)
    features["math_vocab_density"] = math_vocab_density
    # Audit #5: topic Jaccard with input
    features["topic_jaccard"] = jaccard
    return features
def _features_for_line(line: str) -> dict:
    """Parse one JSONL record and return its structural and audit features.

    The returned dict holds the output text, its token count and finish reason,
    the Problem/Solution/step/LaTeX flags, every key produced by
    `audit_math_validity`, and one `start_<n>` prefix column per START_LENS entry.
    """
    record = json.loads(line)
    metadata = record["metadata"]
    # Some SmolLM2 rows had vLLM context-length errors and lack `text`
    text = record.get("text", "")
    input_text = metadata["input"]["text"]
    tokens = metadata["token_count"]
    # A result without a finish reason is treated as a failed inference.
    finish_reason = metadata["inference_results"][0].get("finish_reason", "error")

    problem_found = bool(PROBLEM_RE.search(text))
    solution_found = bool(SOLUTION_RE.search(text))

    row = {
        "text": text,
        "tokens": tokens,
        "finish_reason": finish_reason,
        "has_problem": problem_found,
        "has_solution": solution_found,
        "has_problem_solution": problem_found and solution_found,
        "has_step_by_step": len(STEP_RE.findall(text)) >= 2,
        "has_latex": bool(LATEX_RE.search(text)),
    }
    row.update(audit_math_validity(text, input_text))
    row.update({f"start_{length}": text[:length] for length in START_LENS})
    return row
def load_outputs(paths: list[Path]) -> pd.DataFrame:
    """Load rephrased outputs from one or more chunks and compute per-row features.

    Args:
        paths: Gzip-compressed JSONL files, read in the given order.

    Returns:
        A DataFrame with one row per non-blank JSONL line, holding the
        features produced by `_features_for_line`.
    """
    rows = []
    for path in paths:
        # Decode explicitly as UTF-8: text mode otherwise falls back to the
        # locale encoding, which can mis-decode non-ASCII characters such as
        # "×" and "÷" that the audit regexes look for.
        with gzip.open(path, "rt", encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) are not JSON records.
            rows.extend(_features_for_line(line) for line in f if line.strip())
    return pd.DataFrame(rows)
     return "Poor"
def math_adherence_bucket(row: pd.Series) -> str:
    """Assign one output to a math-adherence bucket.

    The buckets are checked from strongest to weakest evidence of real math and
    the first one that applies wins. Surface structure ("Problem:" /
    "Solution:" markers) plays no role; this audit answers the reviewer concern
    that some "math" outputs contain no math.
    """
    ladder = (
        # At least one verifiably correct N op N = N statement.
        ("has_correct_equation", bool, "Valid math"),
        # Equations are present but none of them is arithmetically correct.
        ("n_equations", lambda count: count > 0, "Wrong math"),
        # Operators between numbers, but no closed equation.
        ("has_arith_op", bool, "Implicit math"),
        # Numbers are present without any arithmetic.
        ("has_digits", bool, "Numeric but no math"),
    )
    for column, applies, label in ladder:
        if applies(row[column]):
            return label
    # No digits at all (the "Paul Revere Williams" case).
    return "No math content"
 def summarise(name: str, df: pd.DataFrame) -> dict:
     """Compute the headline numbers for one model."""
     n = len(df)
     buckets = df.apply(quality_bucket, axis=1).value_counts().to_dict()
+    adherence = df.apply(math_adherence_bucket, axis=1).value_counts().to_dict()
+    # Sample a few "No math content" outputs (the Paul Revere case) for the appendix.
+    no_math_examples = (
+        df[df["has_problem_solution"] & ~df["has_digits"]]["text"]
+        .head(3)
+        .tolist()
+    )
+    total_eqs = int(df["n_equations"].sum())
+    correct_eqs = int(df["n_correct_equations"].sum())
     # Exclude failed inferences (empty text) from prefix-collapse to avoid
     # an artificial "" cluster.
     non_empty = df[df["text"].str.len() > 0]
         "n_errored": int((df["finish_reason"] == "error").sum()),
         "prefix_collapse": prefix_collapse,
         "buckets": {k: 100 * buckets.get(k, 0) / n for k in ("Excellent", "Good", "Partial", "Poor")},
+        "math_adherence": {
+            k: 100 * adherence.get(k, 0) / n
+            for k in ("Valid math", "Wrong math", "Implicit math", "Numeric but no math", "No math content")
+        },
+        "pct_with_equation": 100 * (df["n_equations"] > 0).mean(),
+        "pct_with_correct_equation": 100 * df["has_correct_equation"].mean(),
+        "pct_with_arith_op": 100 * df["has_arith_op"].mean(),
+        "pct_no_digits": 100 * (~df["has_digits"]).mean(),
+        "equation_accuracy": (correct_eqs / total_eqs) if total_eqs else float("nan"),
+        "total_equations": total_eqs,
+        "correct_equations": correct_eqs,
+        # Tier-1 audits
+        "pct_any_number_from_input": 100 * df["any_number_from_input"].mean(),
+        "pct_qa_aligned": 100 * df["qa_aligned"].mean(),
+        "pct_has_final_answer": 100 * df["has_final_answer"].mean(),
+        "math_vocab_density_mean": float(df["math_vocab_density"].mean()),
+        "topic_jaccard_mean": float(df["topic_jaccard"].mean()),
+        "no_math_examples": no_math_examples,
     }
def _print_metric_table(summaries: dict, rows: list[tuple[str, str, str]]) -> None:
    """Log one metric table whose columns are the models in `summaries`.

    Each entry of `rows` is a `(summary_key, display_label, format_spec)`
    triple. A format_spec of "%" renders a one-decimal percentage, "i" renders
    the value as-is, and anything else renders a three-decimal float.
    """
    model_names = list(summaries)
    width = max(11, max(len(model) for model in model_names) + 1)

    def render(value, spec: str) -> str:
        # The percent sign occupies one character of the column width.
        if spec == "%":
            return f"{value:>{width - 1}.1f}%"
        if spec == "i":
            return f"{value:>{width}}"
        return f"{value:>{width}.3f}"

    heading = f"{'Metric':<42}" + "".join(f"{model:>{width}}" for model in model_names)
    logger.info(heading)
    logger.info("-" * len(heading))
    for key, label, spec in rows:
        rendered = "".join(render(summaries[model][key], spec) for model in model_names)
        logger.info(f"{label:<42}" + rendered)
def _migrate_old_cache() -> None:
    """Relocate legacy single-file caches into the per-model chunk layout.

    A file such as `math-qwen3-1.7b-hq.jsonl.gz` directly under the cache
    directory becomes chunk 0 inside that model's sub-folder, unless chunk 0
    is already there.
    """
    if not CACHE_DIR.exists():
        return
    for model_dir in MODELS.values():
        legacy = CACHE_DIR / f"{model_dir}.jsonl.gz"
        target = CACHE_DIR / model_dir / CHUNKS[0]
        if not legacy.exists() or target.exists():
            continue
        target.parent.mkdir(parents=True, exist_ok=True)
        legacy.rename(target)
 def main() -> None:
+    _migrate_old_cache()
     summaries = {}
+    for name, model_dir in tqdm(MODELS.items(), desc="Models", unit="model"):
+        logger.info(f"\nProcessing {name} ({model_dir})...")
+        paths = fetch_chunks(model_dir)
+        df = load_outputs(paths)
+        logger.info(f"  Loaded {len(df)} outputs.")
         summaries[name] = summarise(name, df)
     logger.info("\n=== Structural quality (per 1000 outputs) ===")
+    _print_metric_table(summaries, [
+        ("pct_problem_solution", "Problem + Solution sections", "%"),
+        ("pct_step_by_step", "Numbered step-by-step", "%"),
+        ("pct_latex", "LaTeX math notation", "%"),
+        ("pct_has_solution", "Contains the word 'solution'", "%"),
+    ])
     logger.info("\n=== Output length (tokens, output_tokenizer) ===")
+    _print_metric_table(summaries, [
+        ("token_min", "min (all)", "i"),
+        ("token_median", "median (all)", "i"),
+        ("token_max", "max (all)", "i"),
+        ("stop_token_min", "min (finish=stop)", "i"),
+        ("stop_token_max", "max (finish=stop)", "i"),
+        ("n_truncated", "# length-truncated", "i"),
+        ("n_errored", "# inference errors", "i"),
+    ])
     logger.info("\n=== Template collapse (most common prefix at varying lengths) ===")
+    names = list(summaries)
+    col_w = max(15, max(len(n) for n in names) + 4)
+    header = f"{'Prefix chars':<14}" + "".join(f"{n:>{col_w}}" for n in names)
+    logger.info(header)
+    logger.info("-" * len(header))
     for size in [10, 20, 40, 60]:
+        cells = [f"{summaries[n]['prefix_collapse'][size]['most_common_count']:>4}/1000".rjust(col_w) for n in names]
+        logger.info(f"{size:<14}" + "".join(cells))
     logger.info("\nMost common 40-char prefix per model:")
+    for n in names:
+        logger.info(f"  {n}: {summaries[n]['prefix_collapse'][40]['example']!r}")
     # Emit the dense prefix-collapse curve as CSV for the d3 chart.
     csv_path = Path(__file__).parent.parent / "assets/data/qwen3_vs_smollm2_prefix_collapse.csv"
     pd.DataFrame(csv_rows).to_csv(csv_path, index=False)
     logger.info(f"\nSaved prefix-collapse curve to {csv_path}")
+    # Flatten bucket/adherence dicts into top-level keys so the table helper can read them.
+    for s in summaries.values():
+        for b, v in s["buckets"].items():
+            s[f"bucket__{b}"] = v
+        for b, v in s["math_adherence"].items():
+            s[f"adherence__{b}"] = v
+        s["pct_equation_accuracy"] = (
+            100 * s["equation_accuracy"] if s["equation_accuracy"] == s["equation_accuracy"] else 0.0
         )
+    logger.info("\n=== Quality buckets (% of outputs) ===")
+    _print_metric_table(summaries, [(f"bucket__{b}", b, "%") for b in ("Excellent", "Good", "Partial", "Poor")])
+    logger.info("\n=== Math format-adherence audit (reviewer response) ===")
+    _print_metric_table(summaries, [
+        ("pct_no_digits", "No digits at all", "%"),
+        ("pct_with_arith_op", "Has arithmetic operator", "%"),
+        ("pct_with_equation", "Has full N op N = N equation", "%"),
+        ("pct_with_correct_equation", "Has ≥1 correct arithmetic equation", "%"),
+        ("pct_any_number_from_input", "≥1 number reused from source", "%"),
+        ("pct_qa_aligned", "Question word + numeric answer", "%"),
+        ("pct_has_final_answer", "Closed final-answer pattern", "%"),
+        ("pct_equation_accuracy", "Per-equation arithmetic accuracy", "%"),
+        ("math_vocab_density_mean", "Math-vocab density (per 100 tokens)", "f"),
+        ("topic_jaccard_mean", "Topic Jaccard with source (mean)", "f"),
+    ])
+    logger.info("\n=== Adherence buckets (% of outputs) ===")
+    _print_metric_table(summaries, [
+        (f"adherence__{b}", b, "%")
+        for b in ("Valid math", "Wrong math", "Implicit math", "Numeric but no math", "No math content")
+    ])
+    # Emit a long-format CSV of the per-model format-adherence audit for the embed.
+    adherence_csv = Path(__file__).parent.parent / "assets/data/math_format_adherence.csv"
+    # direction: "higher" = darker bar for higher value, "lower" = darker for lower,
+    # "neutral" = no bar shading and no best-cell highlight.
+    # description: shown in the embed's hover tooltip to explain how the metric is computed.
+    audit_rows = [
+        (
+            "Adherence buckets",
+            "Valid math (≥1 verifiably correct equation)",
+            "adherence__Valid math", "%", "higher",
+            "Output contains at least one arithmetic statement of the form N op N = N where the math checks out (e.g., 120 × 5 = 600). Mutually exclusive with the other adherence buckets.",
+        ),
+        (
+            "Adherence buckets",
+            "Wrong math (equations but none correct)",
+            "adherence__Wrong math", "%", "lower",
+            "Output contains at least one N op N = N statement but every such equation is arithmetically wrong (e.g., 3 × 5 = -15).",
+        ),
+        (
+            "Adherence buckets",
+            "Implicit math (operators, no closed equation)",
+            "adherence__Implicit math", "%", "neutral",
+            "Output contains arithmetic operators directly between numbers (e.g., '120 × 5') or LaTeX math commands (\\times, \\frac, \\cdot) but no fully closed N op N = N equation.",
+        ),
+        (
+            "Adherence buckets",
+            "Numeric but no math",
+            "adherence__Numeric but no math", "%", "lower",
+            "Output contains digits but no arithmetic operator at all. Numbers appear as quantities, years, or counts without any computation.",
+        ),
+        (
+            "Adherence buckets",
+            "No math content (zero digits)",
+            "adherence__No math content", "%", "lower",
+            "Output contains no digits at all. The dog-sunburn example above is one of these.",
+        ),
+        (
+            "Arithmetic correctness",
+            "Per-equation arithmetic accuracy",
+            "pct_equation_accuracy", "%", "higher",
+            "Of every extracted N op N = N equation across all 10,000 outputs, the percentage that are arithmetically correct (tolerance 0.01 or 0.01% of the expected value, to absorb tiny rounding).",
+        ),
+        (
+            "Problem shape",
+            "Question word + numeric answer",
+            "pct_qa_aligned", "%", "higher",
+            "Output contains a question cue ('how many', 'what is', 'calculate', 'find', 'compute', 'determine', etc.) AND has at least one digit in its final 200 characters.",
+        ),
+        (
+            "Problem shape",
+            "Closed final-answer pattern",
+            "pct_has_final_answer", "%", "higher",
+            "Output matches a regex for closed answer patterns like 'the answer is N', '= N.', 'therefore ... = N', 'answer: N', or 'total = N'.",
+        ),
+        (
+            "Problem shape",
+            "Math-vocab density (per 100 alpha tokens)",
+            "math_vocab_density_mean", "f", "higher",
+            "Mean count of math-vocabulary words per 100 alphabetic tokens, averaged across outputs. Vocabulary: calculate, equation, probability, percentage, ratio, sum, product, average, divide, multiply, formula, fraction, etc.",
+        ),
+        (
+            "Source grounding",
+            "≥1 number reused from source",
+            "pct_any_number_from_input", "%", "higher",
+            "Output shares at least one numeric token with the source document (thousands-commas stripped, so '1,500' matches '1500'). Proxy for whether the math is anchored in source content vs invented.",
+        ),
+        (
+            "Source grounding",
+            "Topic Jaccard with source (mean)",
+            "topic_jaccard_mean", "f", "higher",
+            "Mean Jaccard similarity between content-word sets of input and output (lowercase, ≥4 chars, stopwords removed). Higher = output stays closer to source vocabulary.",
+        ),
+    ]
+    csv_rows = []
+    for group, metric_label, key, fmt, direction, description in audit_rows:
+        for model_name in MODELS:
+            csv_rows.append({
+                "group": group,
+                "metric": metric_label,
+                "model": model_name,
+                "value": summaries[model_name][key],
+                "format": fmt,
+                "direction": direction,
+                "description": description,
+            })
+    pd.DataFrame(csv_rows).to_csv(adherence_csv, index=False)
+    logger.info(f"\nSaved format-adherence audit CSV to {adherence_csv}")
+    # Surface representative "no math content" outputs from any model that has them.
+    for model_name in MODELS:
+        exs = summaries[model_name]["no_math_examples"]
+        if exs:
+            logger.info(f"\nRepresentative {model_name} outputs with Problem/Solution structure but NO digits:")
+            for i, ex in enumerate(exs, 1):
+                logger.info(f"\n  [{i}] {ex[:400].strip()}")
+            break  # one model is enough for the printed examples
     out = Path(__file__).parent / "qwen3_vs_smollm2_math_results.json"
     with out.open("w") as f:
         json.dump(summaries, f, indent=2)

app/src/content/analysis/qwen3_vs_smollm2_math_results.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:17a3285a9ea99ce5e54d541b881ffc8dd14c5879b829488c1df295b134cd6fb3
-size 8998

 version https://git-lfs.github.com/spec/v1
+oid sha256:66c976fc35e2c34273caf330dde6cf27fae80617d0dc5390c13d0015988be3fc
+size 64005

app/src/content/assets/data/math_format_adherence.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef76807af9c4df9338083c60562e43ddc6eae771e496c5008e65f4d5fc140c4e
+size 15803

app/src/content/assets/data/qwen3_vs_smollm2_prefix_collapse.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:671df2da7a38aa498ab6daf7814b9241d8a2badb5c141368dfb391819c62dcf7
-size 1038

 version https://git-lfs.github.com/spec/v1
+oid sha256:f4603016ea1f8e19321f77dbaf691bc00a1a8cb3ba8f2e8326a23ebab0f26269
+size 3651

app/src/content/chapters/4-analyses.mdx CHANGED Viewed

@@ -295,12 +295,12 @@ So output length doesn't predict quality either. But we stumbled onto something
 This was one of our most surprising findings. We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.[^math-analysis-script]
-[^math-analysis-script]: All numbers in this section come from `app/src/content/analysis/qwen3_vs_smollm2_math.py`, which scores the first 1000 documents of `format/math-qwen3-1.7b-hq` and `format/math-smollm2-1.7b-hq` from the [finephrase-rephrased bucket](https://huggingface.co/buckets/HuggingFaceFW/finephrase-rephrased/tree/format). Exact counts shift by a few percent with a different sample (different chunk, different seed), but the qualitative gap between the two models is stable.
 **Qwen3 produced beautiful, structured outputs:**
-- 99.8% had proper Problem/Solution sections
-- 99.1% had step-by-step formatting
 - 59% included LaTeX math notation
 Here's a typical Qwen3 output:
@@ -321,40 +321,73 @@ The disc makes 600 revolutions in 5 minutes.
 ```
 **SmolLM2 was messier:**
-- Only 67.5% had complete solutions
-- Wide variance in output length (5 to 2,900 tokens, vs Qwen3's 113 to 1,250)
 - Mix of formats: questions, partial answers, full solutions
-SmolLM2 outputs ranged from proper solutions to just questions like *"What is the difference between X and Y?"* or even 4-token fragments like *"Areas Where We Service"*.
-Yet models trained on SmolLM2's data **outperformed** those trained on Qwen3's data on downstream benchmarks. We suspect this is due to **template collapse**: Qwen3's outputs were *too* consistent. The chart below shows how many outputs share their first N characters, for N ranging from 4 to 60. At every prefix length, Qwen3 has dramatically more duplication than SmolLM2.
 <HtmlEmbed
   id="prefix-collapse"
   src="d3-prefix-collapse.html"
   data="qwen3_vs_smollm2_prefix_collapse.csv"
-  desc="Template collapse curve across prefix lengths. Toggle between 'Most common prefix count' (how many of 1000 outputs share the single most common opening) and 'Distinct prefixes' (how many unique openings exist). Qwen3 collapses far more aggressively at every prefix length."
 />
-The numbers are striking: at the **first 10 characters**, Qwen3 has only **9 distinct openings** across 1000 outputs (783 of them share the most common one), while SmolLM2 has **270 distinct openings**. At 20 characters, Qwen3 still has 129 outputs sharing one opening (`'**Mathematical Word Problem:** A'`) versus only 39 for SmolLM2. The two models' top 40-character prefixes tell the story directly: Qwen3 produces `'**Mathematical Word Problem:**\n\nA school'` over and over, while SmolLM2's most common opener is the looser `'Question: What is the difference between'`.
 SmolLM2's quality distribution was actually reasonable:
 | Quality | Criteria | Share |
 | --- | --- | --- |
-| Excellent | Has "solution" + structural steps + 80+ tokens | 44.5% |
-| Good | Has "solution" + 50+ tokens | 21.2% |
-| Partial | 30+ tokens but missing structure | 23.4% |
-| Poor | {'<'}30 tokens | 10.9% |
-The lesson: for pretraining data, diversity beats consistency. A model that doesn't follow instructions perfectly can actually produce better training data than one that does. This also helps explain why SmolLM2 dominates the model family comparison: it produces more varied outputs, which may matter more than precise instruction following.
 <Note title="Summary: Analyses" variant="info">
 **Cost**: Small models with simple prompts dominate the Pareto frontier. Invest in prompt design, not model size.<br/>
 **Quality scores**: Neither edu-score nor DCLM-score reliably predicts downstream performance for synthetic data. There is no shortcut to training and evaluating.<br/>
 **Proxy model size**: A 2.9B student reveals three tiers (270M {'<'} 1B {'<'} 4B+) that the 1.7B student compressed. Generator gains above 1B are real but smaller than student-side gains. Student scale is the bigger lever.<br/>
 **Verbosity**: Output length has no meaningful relationship with performance. What matters is content, not compression ratio.<br/>
-**Diversity**: Template collapse hurts more than noisy outputs. A messier model that produces varied text can outperform a polished one that repeats the same template.
 </Note>
 With the experiments and analyses behind us, let's talk about the infrastructure that made all of this possible.

 This was one of our most surprising findings. We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.[^math-analysis-script]
+[^math-analysis-script]: All numbers in this section come from `app/src/content/analysis/qwen3_vs_smollm2_math.py`, which scores the first 10,000 documents of each `format/math-{model}-hq` dataset from the [finephrase-rephrased bucket](https://huggingface.co/buckets/HuggingFaceFW/finephrase-rephrased/tree/format). All six generators are small instruction-tuned models of comparable size (1B to 1.7B parameters): SmolLM2 (1.7B), Qwen3 (1.7B), Llama 3.2 (1B), Gemma 3 (1B), Falcon 3 (1B), and Granite 3 (1B). Exact counts shift by a fraction of a percent across different chunks; the qualitative gaps between the six are stable.
 **Qwen3 produced beautiful, structured outputs:**
+- 99.7% had proper Problem/Solution sections
+- 98.9% had step-by-step formatting
 - 59% included LaTeX math notation
 Here's a typical Qwen3 output:
 ```
 **SmolLM2 was messier:**
+- Only 68.8% had complete solutions
+- Wide variance in output length (1 to 3,540 tokens, vs Qwen3's 40 to 2,180)
 - Mix of formats: questions, partial answers, full solutions
+SmolLM2 outputs ranged from proper solutions to just questions like *"What is the relationship between X and Y?"* or even tiny fragments like *"Areas Where We Service"*.
+Yet models trained on SmolLM2's data **outperformed** those trained on Qwen3's data on downstream benchmarks. We suspect this is due to **template collapse**: Qwen3's outputs were *too* consistent. The chart below shows, for each model, how many of its 10,000 outputs share their first N characters, for even values of N from 2 to 60. At every prefix length, Qwen3 has dramatically more duplication than SmolLM2.
 <HtmlEmbed
   id="prefix-collapse"
   src="d3-prefix-collapse.html"
   data="qwen3_vs_smollm2_prefix_collapse.csv"
+  desc="Template collapse curve across prefix lengths. Toggle between 'Most common prefix count' (how many of 10000 outputs share the single most common opening) and 'Distinct prefixes' (how many unique openings exist). Qwen3 collapses far more aggressively at every prefix length."
 />
+The numbers are striking: at the **first 10 characters**, Qwen3 has only **40 distinct openings** across 10,000 outputs (7,619 of them share the most common one, `'**Problem:'`), while SmolLM2 has **1,897 distinct openings**. At 20 characters, Qwen3 still has 1,209 outputs sharing one opening (`'**Mathematical Word '`) versus only 443 for SmolLM2. The two models' top 40-character prefixes tell the story directly: Qwen3 produces `'**Mathematical Word Problem:**\n\nA school'` 87 times, while SmolLM2's most common opener (`'Question: What is the relationship betwe'`) appears only 35 times.
 SmolLM2's quality distribution was actually reasonable:
 | Quality | Criteria | Share |
 | --- | --- | --- |
+| Excellent | Has "solution" + structural steps + 80+ tokens | 44.6% |
+| Good | Has "solution" + 50+ tokens | 22.7% |
+| Partial | 30+ tokens but missing structure | 24.1% |
+| Poor | {'<'}30 tokens | 8.6% |
+#### Does any of this actually contain math?
+The structural and length metrics above tell us *how the outputs look*, not whether they contain valid math. To audit format adherence directly, we run a parser on each output that extracts every `N op N = N` arithmetic statement and checks it for correctness, then bucket each output by what it actually contains (a verifiably correct equation, a wrong equation, a bare arithmetic operator with no closed equation, just numbers, or no digits at all). On top of that we compute five cheap heuristics: whether any number in the output also appears in the source document (number provenance), whether the problem contains a question word and the solution ends with a numeric answer (Q/A alignment), whether the solution contains a closed final-answer pattern, the math-vocabulary density, and the topic Jaccard overlap with the source. We ran the audit on the first 10,000 outputs from each of the six generators. Toggle which models are visible:
+<Wide>
+<HtmlEmbed
+  id="math-adherence-audit"
+  src="math-adherence-audit.html"
+  data="math_format_adherence.csv"
+  desc="Per-model format-adherence audit, 10000 outputs per model. Bar-shading and the bold cell mark the best value in each row, with higher- or lower-is-better direction adjusted per metric. Click pills to add or remove models; SmolLM2, Qwen3, and Llama 3.2 are shown by default."
+/>
+</Wide>
+Three findings deserve emphasis. First, **nearly a quarter of SmolLM2's "math" outputs contain no numeric content whatsoever** (23.0%); instead, the model often produces pseudo-problems on unrelated topics. A representative example:
+```
+Problem: How can dog owners effectively prevent sunburn in their dogs
+and ensure their dogs' skin health remains intact?
+Solution: Dogs can be prevented sunburn using sunscreen, protective
+clothing, shade, and water. Owners should identify their dog's
+sun-vulnerable areas and apply sunscreen accordingly...
+```
+The same problem appears at a much lower rate in every other model: 7.7% for Gemma 3, 2.1% for Falcon 3, 1.9% for Granite 3, 0.7% for Llama 3.2, and 0.1% for Qwen3.
+Second, structural perfection is *not* matched by mathematical correctness in any model. Per-equation arithmetic accuracy ranges from **Qwen3's 78.9%** (5072/6432 equations correct) down to **SmolLM2's 57.3%** (1385/2418), with the four others spread in between (Llama 72%, Falcon 71%, Gemma 68%, Granite 62%). No model is a reliable math tutor at the equation level, even when the surface formatting looks immaculate.
+Third, and most surprising: **the model whose outputs stay closest to the source document is Gemma 3**, not the most formatted or the most math-dense one. Topic-Jaccard overlap ranks Gemma (0.187) > Granite (0.126) > SmolLM2 (0.120) > Falcon (0.114) > Llama (0.094) > Qwen3 (0.093). Llama 3.2 has the highest math-vocab density (2.80 math-words per 100 alpha tokens, vs SmolLM2's 1.25), and Qwen3 dominates Q/A formatting (94.4% vs SmolLM2's 43.0%) and structural completion (99.7% Problem/Solution sections). Yet these two most "polished" generators sit at the bottom of the source-grounding ranking, and both trail Gemma 3 on the actual downstream benchmark.
+Put all of this together and the headline result becomes even more striking. **Qwen3 outputs are nearly 4x more likely to contain valid math than SmolLM2's (30.2% vs 8.1%), yet models pretrained on SmolLM2's outputs still perform better downstream.** Whatever pretraining is rewarding here, it isn't the correctness of individual word problems. The model-family comparison is picking up format and topic diversity in the rephrased text more than the validity of the math itself, consistent with the WRAP [@wrap] finding that paraphrase diversity drives downstream gains.
+The lesson: for pretraining data, diversity beats consistency. A model that doesn't follow instructions perfectly, and even produces topically off-target outputs, can still yield better training data than one that produces polished but formulaic math. This also helps explain why SmolLM2 dominates the model family comparison: it produces more varied outputs, which may matter more than precise instruction following.
 <Note title="Summary: Analyses" variant="info">
 **Cost**: Small models with simple prompts dominate the Pareto frontier. Invest in prompt design, not model size.<br/>
 **Quality scores**: Neither edu-score nor DCLM-score reliably predicts downstream performance for synthetic data. There is no shortcut to training and evaluating.<br/>
 **Proxy model size**: A 2.9B student reveals three tiers (270M {'<'} 1B {'<'} 4B+) that the 1.7B student compressed. Generator gains above 1B are real but smaller than student-side gains. Student scale is the bigger lever.<br/>
 **Verbosity**: Output length has no meaningful relationship with performance. What matters is content, not compression ratio.<br/>
+**Diversity**: Template collapse hurts more than noisy outputs. A messier model that produces varied text can outperform a polished one that repeats the same template.<br/>
+**Format adherence**: Per-equation arithmetic accuracy ranges from Qwen3's 78.9% down to SmolLM2's 57.3% across six generators, and 23.0% of SmolLM2 outputs have no numeric content at all. Yet SmolLM2 still wins downstream, so the pretraining signal isn't math validity but format and topic variety.
 </Note>
 With the experiments and analyses behind us, let's talk about the infrastructure that made all of this possible.

app/src/content/embeds/d3-prefix-collapse.html CHANGED Viewed

@@ -2,7 +2,7 @@
 <style>
   .d3-prefix-collapse { position: relative; }
   .d3-prefix-collapse .controls {
-    display: flex; gap: 16px; align-items: center; justify-content: flex-end;
     flex-wrap: wrap; margin: 0 0 8px 0;
   }
   .d3-prefix-collapse .controls .control-group {
@@ -16,6 +16,25 @@
     border: 1px solid var(--border-color); border-radius: 8px;
     background: var(--surface-bg); color: var(--text-color); cursor: pointer;
   }
   .d3-prefix-collapse .legend {
     display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
     margin: 8px 0 0 0;
@@ -89,25 +108,39 @@
     const METRICS = {
       most_common_count: {
-        label: 'Most common prefix (count / 1000)',
         y: 'most_common_count',
         yLabel: 'Outputs sharing the most-common prefix',
         higherIs: 'more collapse',
       },
       distinct: {
-        label: 'Distinct prefixes (out of 1000)',
         y: 'distinct',
-        yLabel: 'Distinct prefixes (out of 1000 outputs)',
         higherIs: 'more diversity',
       },
     };
-    const MODELS = ['SmolLM2', 'Qwen3'];
-    // Controls
     const controls = document.createElement('div');
     controls.className = 'controls';
-    const controlGroup = document.createElement('div');
-    controlGroup.className = 'control-group';
     const labelEl = document.createElement('label');
     const selectId = `metric-select-${Math.random().toString(36).slice(2, 8)}`;
     labelEl.htmlFor = selectId;
@@ -119,9 +152,10 @@
       opt.value = key; opt.textContent = m.label;
       select.appendChild(opt);
     });
-    controlGroup.appendChild(labelEl);
-    controlGroup.appendChild(select);
-    controls.appendChild(controlGroup);
     container.appendChild(controls);
     // Tooltip
@@ -145,39 +179,35 @@
     const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
     const gRoot = svg.append('g');
-    // Legend
-    const legend = document.createElement('div');
-    legend.className = 'legend';
-    const legendTitle = document.createElement('div');
-    legendTitle.className = 'legend-title';
-    legendTitle.textContent = 'Legend';
-    legend.appendChild(legendTitle);
-    const legendItems = document.createElement('div');
-    legendItems.className = 'items';
-    legend.appendChild(legendItems);
-    container.appendChild(legend);
     const getColors = () => {
-      if (window.ColorPalettes) {
-        const cat = window.ColorPalettes.getColors('categorical', 5);
-        return { SmolLM2: cat[2], Qwen3: cat[0] };
-      }
-      return { SmolLM2: '#3fb950', Qwen3: '#f85149' };
     };
-    function buildLegend(colors) {
-      legendItems.innerHTML = '';
       MODELS.forEach(name => {
-        const item = document.createElement('span');
-        item.className = 'item';
-        const sw = document.createElement('span');
-        sw.className = 'swatch';
-        sw.style.background = colors[name];
-        const txt = document.createElement('span');
-        txt.textContent = name;
-        item.appendChild(sw);
-        item.appendChild(txt);
-        legendItems.appendChild(item);
       });
     }
@@ -202,7 +232,8 @@
     function render() {
       if (!chartData) return;
       const colors = getColors();
-      buildLegend(colors);
       const width = container.clientWidth || 800;
       const height = Math.max(280, Math.round(width / 2.6));
@@ -216,10 +247,12 @@
       const yKey = metric.y;
       const xExtent = d3.extent(chartData, d => d.prefix_chars);
-      const yMax = d3.max(chartData, d => d[yKey]);
       const x = d3.scaleLinear().domain(xExtent).range([0, iw]).nice();
-      const y = d3.scaleLinear().domain([0, Math.max(yMax, 1000)]).range([ih, 0]).nice();
       gRoot.selectAll('*').remove();
@@ -245,12 +278,12 @@
         .attr('x', -ih / 2).attr('y', -50)
         .attr('text-anchor', 'middle').text(metric.yLabel);
-      // Lines + points per model
       const line = d3.line()
         .x(d => x(d.prefix_chars))
         .y(d => y(d[yKey]));
-      MODELS.forEach(name => {
         const series = chartData.filter(d => d.model === name).sort((a, b) => a.prefix_chars - b.prefix_chars);
         gRoot.append('path')
           .datum(series)
@@ -268,7 +301,7 @@
           .attr('stroke-width', 1);
       });
-      // Hover overlay: vertical bisector that highlights both series at the
       // nearest prefix length.
       const allPrefix = Array.from(new Set(chartData.map(d => d.prefix_chars))).sort((a, b) => a - b);
       const overlay = gRoot.append('rect')
@@ -285,19 +318,15 @@
           const xv = x.invert(mx);
           const nearest = allPrefix.reduce((a, b) => Math.abs(b - xv) < Math.abs(a - xv) ? b : a);
           hoverLine.attr('x1', x(nearest)).attr('x2', x(nearest)).style('opacity', 1);
-          const sm = chartData.find(d => d.model === 'SmolLM2' && d.prefix_chars === nearest);
-          const qw = chartData.find(d => d.model === 'Qwen3' && d.prefix_chars === nearest);
-          const html = `
-            <div><strong>Prefix: ${nearest} chars</strong></div>
-            <div class="row" style="margin-top:6px">
-              <span class="name"><span class="swatch" style="background:${colors.SmolLM2}"></span>SmolLM2</span>
-              <span><strong>${sm[yKey]}</strong></span>
-            </div>
-            <div class="row">
-              <span class="name"><span class="swatch" style="background:${colors.Qwen3}"></span>Qwen3</span>
-              <span><strong>${qw[yKey]}</strong></span>
             </div>`;
-          showTip(html, event);
         })
         .on('mouseleave', () => { hoverLine.style('opacity', 0); hideTip(); });
     }
@@ -311,6 +340,11 @@
         most_common_count: +d.most_common_count,
         distinct: +d.distinct,
       }));
       render();
     }).catch(err => {
       const pre = document.createElement('pre');

 <style>
   .d3-prefix-collapse { position: relative; }
   .d3-prefix-collapse .controls {
+    display: flex; gap: 24px; align-items: flex-start; justify-content: space-between;
     flex-wrap: wrap; margin: 0 0 8px 0;
   }
   .d3-prefix-collapse .controls .control-group {
     border: 1px solid var(--border-color); border-radius: 8px;
     background: var(--surface-bg); color: var(--text-color); cursor: pointer;
   }
+  .d3-prefix-collapse .pills {
+    display: flex; flex-wrap: wrap; gap: 6px;
+  }
+  .d3-prefix-collapse .pill {
+    font-size: 12px; padding: 6px 12px; border-radius: 999px;
+    border: 1px solid var(--border-color); background: var(--surface-bg);
+    color: var(--muted-color); cursor: pointer; user-select: none;
+    transition: background .12s ease, color .12s ease, border-color .12s ease;
+    display: inline-flex; align-items: center; gap: 6px;
+  }
+  .d3-prefix-collapse .pill:hover { border-color: var(--text-color); }
+  .d3-prefix-collapse .pill.active {
+    color: var(--surface-bg); background: var(--text-color);
+    border-color: var(--text-color); font-weight: 600;
+  }
+  .d3-prefix-collapse .pill .dot {
+    width: 8px; height: 8px; border-radius: 50%;
+    background: var(--pill-color, currentColor);
+  }
   .d3-prefix-collapse .legend {
     display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
     margin: 8px 0 0 0;
     const METRICS = {
       most_common_count: {
+        label: 'Most common prefix (count)',
         y: 'most_common_count',
         yLabel: 'Outputs sharing the most-common prefix',
         higherIs: 'more collapse',
       },
       distinct: {
+        label: 'Distinct prefixes (count)',
         y: 'distinct',
+        yLabel: 'Distinct prefixes (out of all outputs)',
         higherIs: 'more diversity',
       },
     };
+    const DEFAULT_VISIBLE = new Set(['SmolLM2', 'Qwen3']);
+    // Filled once the data loads (preserves CSV insertion order).
+    let MODELS = [];
+    const visible = new Set();
+    // Controls: model toggle pills on the left, metric selector on the right.
     const controls = document.createElement('div');
     controls.className = 'controls';
+    const modelGroup = document.createElement('div');
+    modelGroup.className = 'control-group';
+    const modelLabel = document.createElement('label');
+    modelLabel.textContent = 'Models (click to toggle)';
+    modelGroup.appendChild(modelLabel);
+    const pillRow = document.createElement('div');
+    pillRow.className = 'pills';
+    modelGroup.appendChild(pillRow);
+    controls.appendChild(modelGroup);
+    const metricGroup = document.createElement('div');
+    metricGroup.className = 'control-group';
     const labelEl = document.createElement('label');
     const selectId = `metric-select-${Math.random().toString(36).slice(2, 8)}`;
     labelEl.htmlFor = selectId;
       opt.value = key; opt.textContent = m.label;
       select.appendChild(opt);
     });
+    metricGroup.appendChild(labelEl);
+    metricGroup.appendChild(select);
+    controls.appendChild(metricGroup);
     container.appendChild(controls);
     // Tooltip
     const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
     const gRoot = svg.append('g');
+    // We keep the model toggles in `pillRow`; no separate legend block needed
+    // because the pills themselves serve as the legend.
     const getColors = () => {
+      // Stable categorical assignment by MODELS insertion order.
+      const cat = window.ColorPalettes
+        ? window.ColorPalettes.getColors('categorical', Math.max(6, MODELS.length))
+        : ['#3fb950', '#f85149', '#58a6ff', '#f0883e', '#bc8cff', '#f7c843'];
+      const out = {};
+      MODELS.forEach((m, i) => { out[m] = cat[i % cat.length]; });
+      return out;
     };
+    function buildPills(colors) {
+      pillRow.innerHTML = '';
       MODELS.forEach(name => {
+        const pill = document.createElement('span');
+        pill.className = 'pill' + (visible.has(name) ? ' active' : '');
+        pill.style.setProperty('--pill-color', colors[name]);
+        pill.innerHTML = `<span class="dot"></span>${name}`;
+        pill.addEventListener('click', () => {
+          if (visible.has(name)) {
+            if (visible.size > 1) visible.delete(name);
+          } else {
+            visible.add(name);
+          }
+          render();
+        });
+        pillRow.appendChild(pill);
       });
     }
     function render() {
       if (!chartData) return;
       const colors = getColors();
+      buildPills(colors);
+      const visibleModels = MODELS.filter(m => visible.has(m));
       const width = container.clientWidth || 800;
       const height = Math.max(280, Math.round(width / 2.6));
       const yKey = metric.y;
       const xExtent = d3.extent(chartData, d => d.prefix_chars);
+      // Scale y to the max among visible models so the chart adapts to selection.
+      const visibleData = chartData.filter(d => visible.has(d.model));
+      const yMax = d3.max(visibleData, d => d[yKey]) || 1;
       const x = d3.scaleLinear().domain(xExtent).range([0, iw]).nice();
+      const y = d3.scaleLinear().domain([0, yMax * 1.05]).range([ih, 0]).nice();
       gRoot.selectAll('*').remove();
         .attr('x', -ih / 2).attr('y', -50)
         .attr('text-anchor', 'middle').text(metric.yLabel);
+      // Lines + points per visible model.
       const line = d3.line()
         .x(d => x(d.prefix_chars))
         .y(d => y(d[yKey]));
+      visibleModels.forEach(name => {
         const series = chartData.filter(d => d.model === name).sort((a, b) => a.prefix_chars - b.prefix_chars);
         gRoot.append('path')
           .datum(series)
           .attr('stroke-width', 1);
       });
+      // Hover overlay: vertical bisector that lists every visible model at the
       // nearest prefix length.
       const allPrefix = Array.from(new Set(chartData.map(d => d.prefix_chars))).sort((a, b) => a - b);
       const overlay = gRoot.append('rect')
           const xv = x.invert(mx);
           const nearest = allPrefix.reduce((a, b) => Math.abs(b - xv) < Math.abs(a - xv) ? b : a);
           hoverLine.attr('x1', x(nearest)).attr('x2', x(nearest)).style('opacity', 1);
+          const N = d3.max(chartData, d => d.distinct);
+          const rows = visibleModels.map(m => {
+            const d = chartData.find(r => r.model === m && r.prefix_chars === nearest);
+            return `<div class="row">
+              <span class="name"><span class="swatch" style="background:${colors[m]}"></span>${m}</span>
+              <span><strong>${d[yKey].toLocaleString()}</strong> / ${N.toLocaleString()}</span>
             </div>`;
+          }).join('');
+          showTip(`<div><strong>Prefix: ${nearest} chars</strong></div>${rows}`, event);
         })
         .on('mouseleave', () => { hoverLine.style('opacity', 0); hideTip(); });
     }
         most_common_count: +d.most_common_count,
         distinct: +d.distinct,
       }));
+      // Preserve CSV insertion order for the model list.
+      MODELS = [];
+      for (const r of chartData) if (!MODELS.includes(r.model)) MODELS.push(r.model);
+      const defaults = MODELS.filter(m => DEFAULT_VISIBLE.has(m));
+      (defaults.length ? defaults : MODELS.slice(0, 3)).forEach(m => visible.add(m));
       render();
     }).catch(err => {
       const pre = document.createElement('pre');

app/src/content/embeds/math-adherence-audit.html ADDED Viewed

	@@ -0,0 +1,346 @@

+<div class="math-adherence-audit"></div>
+<style>
+  .math-adherence-audit { position: relative; }
+  .math-adherence-audit .controls {
+    display: flex; gap: 16px; align-items: center; flex-wrap: wrap; margin: 0 0 12px 0;
+  }
+  .math-adherence-audit .controls .control-group {
+    display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
+  }
+  .math-adherence-audit .controls .label {
+    font-size: 12px; font-weight: 700; color: var(--text-color);
+  }
+  .math-adherence-audit .pills {
+    display: flex; flex-wrap: wrap; gap: 6px;
+  }
+  .math-adherence-audit .pill {
+    font-size: 12px; padding: 6px 12px; border-radius: 999px;
+    border: 1px solid var(--border-color); background: var(--surface-bg);
+    color: var(--muted-color); cursor: pointer; user-select: none;
+    transition: background .12s ease, color .12s ease, border-color .12s ease;
+    display: inline-flex; align-items: center; gap: 6px;
+  }
+  .math-adherence-audit .pill:hover {
+    border-color: var(--text-color);
+  }
+  .math-adherence-audit .pill.active {
+    color: var(--surface-bg);
+    background: var(--text-color);
+    border-color: var(--text-color);
+    font-weight: 600;
+  }
+  .math-adherence-audit .pill .dot {
+    width: 8px; height: 8px; border-radius: 50%;
+    background: var(--pill-color, currentColor);
+  }
+  .math-adherence-audit .table-scroll {
+    overflow-x: auto; -webkit-overflow-scrolling: touch;
+  }
+  .math-adherence-audit table {
+    width: 100%; border-collapse: collapse; font-size: 13px;
+    min-width: 100%;
+  }
+  .math-adherence-audit th, .math-adherence-audit td {
+    padding: 8px 10px; text-align: right; color: var(--text-color);
+    border-bottom: 1px solid var(--border-color);
+  }
+  .math-adherence-audit th:first-child, .math-adherence-audit td:first-child {
+    text-align: left; font-weight: 500;
+  }
+  .math-adherence-audit th {
+    font-size: 12px; font-weight: 700; color: var(--text-color);
+    border-bottom: 2px solid var(--border-color);
+  }
+  .math-adherence-audit tr.group-header td {
+    background: transparent;
+    font-size: 11px; font-weight: 700; text-transform: uppercase;
+    letter-spacing: 0.04em; color: var(--muted-color);
+    padding-top: 14px; padding-bottom: 4px;
+    border-bottom: none;
+  }
+  .math-adherence-audit td.value {
+    font-variant-numeric: tabular-nums;
+    position: relative;
+    overflow: hidden;
+  }
+  .math-adherence-audit td.value .bar {
+    position: absolute; left: 0; top: 0; bottom: 0;
+    background: var(--primary-color); opacity: 0.12;
+    pointer-events: none; z-index: 0;
+  }
+  .math-adherence-audit td.value .v {
+    position: relative; z-index: 1;
+  }
+  .math-adherence-audit td.value.is-max .v { font-weight: 700; }
+  .math-adherence-audit td .metric-label {
+    cursor: help;
+    border-bottom: 1px dotted var(--muted-color);
+  }
+  .math-adherence-audit .info-tip {
+    position: absolute; top: 0; left: 0;
+    transform: translate(-9999px, -9999px);
+    pointer-events: none; padding: 10px 12px; border-radius: 8px;
+    font-size: 12px; line-height: 1.45;
+    border: 1px solid var(--border-color);
+    background: var(--surface-bg); color: var(--text-color);
+    box-shadow: 0 4px 24px rgba(0,0,0,.18);
+    opacity: 0; transition: opacity .12s ease;
+    max-width: 340px;
+    z-index: 20;
+  }
+</style>
+<script>
+(() => {
+  const bootstrap = () => {
+    const scriptEl = document.currentScript;
+    let container = scriptEl ? scriptEl.previousElementSibling : null;
+    if (!(container && container.classList && container.classList.contains('math-adherence-audit'))) {
+      const cs = Array.from(document.querySelectorAll('.math-adherence-audit'))
+        .filter(el => !(el.dataset && el.dataset.mounted === 'true'));
+      container = cs[cs.length - 1] || null;
+    }
+    if (!container) return;
+    if (container.dataset) {
+      if (container.dataset.mounted === 'true') return;
+      container.dataset.mounted = 'true';
+    }
+    const DEFAULT_VISIBLE = new Set(['SmolLM2', 'Qwen3', 'Llama 3.2']);
+    const fetchCSV = async () => {
+      const paths = [
+        '/data/math_format_adherence.csv',
+        './assets/data/math_format_adherence.csv',
+        '../assets/data/math_format_adherence.csv',
+      ];
+      for (const p of paths) {
+        try { const r = await fetch(p, { cache: 'no-cache' }); if (r.ok) return await r.text(); } catch (_) {}
+      }
+      throw new Error('CSV not found');
+    };
+    // CSV parser that handles double-quoted fields (pandas auto-quotes fields
+    // containing commas, e.g. "Implicit math (operators, no closed equation)").
+    const parseCSVLine = (line) => {
+      const out = [];
+      let cur = '';
+      let inQuotes = false;
+      for (let i = 0; i < line.length; i++) {
+        const ch = line[i];
+        if (inQuotes) {
+          if (ch === '"' && line[i + 1] === '"') { cur += '"'; i++; }
+          else if (ch === '"') { inQuotes = false; }
+          else { cur += ch; }
+        } else {
+          if (ch === '"') { inQuotes = true; }
+          else if (ch === ',') { out.push(cur); cur = ''; }
+          else { cur += ch; }
+        }
+      }
+      out.push(cur);
+      return out;
+    };
+    const parseCSV = (text) => {
+      const lines = text.trim().split('\n');
+      const cols = parseCSVLine(lines.shift());
+      return lines.map(l => {
+        const cells = parseCSVLine(l);
+        const o = {};
+        cols.forEach((c, i) => { o[c] = cells[i]; });
+        return o;
+      });
+    };
+    const fmtValue = (v, fmt) => fmt === '%' ? `${(+v).toFixed(1)}%` : (+v).toFixed(3);
+    const render = (rows) => {
+      // Preserve original metric and model order from the CSV.
+      const metrics = [];
+      const metricMeta = {};
+      const models = [];
+      for (const r of rows) {
+        if (!metricMeta[r.metric]) {
+          metricMeta[r.metric] = {
+            group: r.group,
+            format: r.format,
+            direction: r.direction || 'higher',
+            description: r.description || '',
+          };
+          metrics.push(r.metric);
+        }
+        if (!models.includes(r.model)) models.push(r.model);
+      }
+      const grouped = {};
+      for (const r of rows) {
+        grouped[r.metric] = grouped[r.metric] || {};
+        grouped[r.metric][r.model] = +r.value;
+      }
+      const state = {
+        visible: new Set([...models.filter(m => DEFAULT_VISIBLE.has(m))]),
+      };
+      if (state.visible.size === 0) state.visible = new Set(models.slice(0, 3));
+      container.innerHTML = '';
+      // Tooltip for metric descriptions
+      const tip = document.createElement('div');
+      tip.className = 'info-tip';
+      container.appendChild(tip);
+      const showTip = (html, event) => {
+        tip.innerHTML = html;
+        tip.style.opacity = '1';
+        const cr = container.getBoundingClientRect();
+        const [mx, my] = [event.clientX - cr.left, event.clientY - cr.top];
+        const tw = tip.offsetWidth;
+        const x = mx + tw + 16 > cr.width ? Math.max(0, mx - tw - 12) : mx + 14;
+        tip.style.transform = `translate(${x}px, ${my + 14}px)`;
+      };
+      const hideTip = () => { tip.style.opacity = '0'; tip.style.transform = 'translate(-9999px,-9999px)'; };
+      // Pills
+      const controls = document.createElement('div');
+      controls.className = 'controls';
+      const grp = document.createElement('div');
+      grp.className = 'control-group';
+      const lab = document.createElement('div');
+      lab.className = 'label';
+      lab.textContent = 'Models (click to toggle)';
+      grp.appendChild(lab);
+      // Stable categorical color per model, matching the prefix-collapse chart.
+      const palette = window.ColorPalettes
+        ? window.ColorPalettes.getColors('categorical', Math.max(6, models.length))
+        : ['#3fb950', '#f85149', '#58a6ff', '#f0883e', '#bc8cff', '#f7c843'];
+      const modelColor = Object.fromEntries(models.map((m, i) => [m, palette[i % palette.length]]));
+      const pillRow = document.createElement('div');
+      pillRow.className = 'pills';
+      models.forEach(m => {
+        const pill = document.createElement('span');
+        pill.className = 'pill' + (state.visible.has(m) ? ' active' : '');
+        pill.style.setProperty('--pill-color', modelColor[m]);
+        pill.innerHTML = `<span class="dot"></span>${m}`;
+        pill.addEventListener('click', () => {
+          if (state.visible.has(m)) {
+            if (state.visible.size > 1) state.visible.delete(m);
+          } else {
+            state.visible.add(m);
+          }
+          drawTable();
+        });
+        pillRow.appendChild(pill);
+      });
+      grp.appendChild(pillRow);
+      controls.appendChild(grp);
+      container.appendChild(controls);
+      const tableWrap = document.createElement('div');
+      tableWrap.className = 'table-scroll';
+      container.appendChild(tableWrap);
+      const drawTable = () => {
+        // Refresh pills
+        Array.from(pillRow.children).forEach((pill, i) => {
+          pill.classList.toggle('active', state.visible.has(models[i]));
+        });
+        const visibleModels = models.filter(m => state.visible.has(m));
+        const table = document.createElement('table');
+        const thead = document.createElement('thead');
+        const trh = document.createElement('tr');
+        trh.appendChild(Object.assign(document.createElement('th'), { textContent: 'Metric' }));
+        visibleModels.forEach(m => {
+          trh.appendChild(Object.assign(document.createElement('th'), { textContent: m }));
+        });
+        thead.appendChild(trh);
+        table.appendChild(thead);
+        const tbody = document.createElement('tbody');
+        let currentGroup = null;
+        metrics.forEach(metric => {
+          const meta = metricMeta[metric];
+          if (meta.group !== currentGroup) {
+            const groupTr = document.createElement('tr');
+            groupTr.className = 'group-header';
+            const td = document.createElement('td');
+            td.colSpan = visibleModels.length + 1;
+            td.textContent = meta.group;
+            groupTr.appendChild(td);
+            tbody.appendChild(groupTr);
+            currentGroup = meta.group;
+          }
+          const tr = document.createElement('tr');
+          const labelTd = document.createElement('td');
+          const labelSpan = document.createElement('span');
+          labelSpan.className = 'metric-label';
+          labelSpan.textContent = metric;
+          if (meta.description) {
+            const descHTML = `<strong>${metric}</strong><br/><span style="color:var(--muted-color)">${meta.description}</span>`;
+            labelSpan.addEventListener('mouseenter', (e) => showTip(descHTML, e));
+            labelSpan.addEventListener('mousemove', (e) => showTip(descHTML, e));
+            labelSpan.addEventListener('mouseleave', hideTip);
+          }
+          labelTd.appendChild(labelSpan);
+          tr.appendChild(labelTd);
+          const vals = visibleModels.map(m => grouped[metric][m]);
+          const maxVal = Math.max(...vals);
+          const minVal = Math.min(...vals);
+          // Best cell index depends on whether higher or lower is better.
+          // Neutral metrics get no highlight.
+          let bestIdx = -1;
+          if (meta.direction === 'higher' && vals.length > 1) bestIdx = vals.indexOf(maxVal);
+          else if (meta.direction === 'lower' && vals.length > 1) bestIdx = vals.indexOf(minVal);
+          visibleModels.forEach((m, i) => {
+            const td = document.createElement('td');
+            td.className = 'value' + (i === bestIdx ? ' is-max' : '');
+            if (meta.direction !== 'neutral') {
+              const bar = document.createElement('span');
+              bar.className = 'bar';
+              // For "lower-is-better" metrics, invert so the smallest value gets the longest bar.
+              let norm;
+              if (meta.direction === 'higher') {
+                norm = maxVal > 0 ? vals[i] / maxVal : 0;
+              } else {
+                // direction === 'lower'; rescale so min -> 1, max -> small floor
+                const range = maxVal - minVal;
+                norm = range > 0 ? 1 - (vals[i] - minVal) / range * 0.85 : 1;
+              }
+              bar.style.width = `${(norm * 100).toFixed(1)}%`;
+              td.appendChild(bar);
+            }
+            const v = document.createElement('span');
+            v.className = 'v';
+            v.textContent = fmtValue(vals[i], meta.format);
+            td.appendChild(v);
+            tr.appendChild(td);
+          });
+          tbody.appendChild(tr);
+        });
+        table.appendChild(tbody);
+        tableWrap.innerHTML = '';
+        tableWrap.appendChild(table);
+      };
+      drawTable();
+    };
+    fetchCSV().then(text => render(parseCSV(text))).catch(err => {
+      const pre = document.createElement('pre');
+      pre.style.color = 'red';
+      pre.textContent = `Error loading audit data: ${err.message}`;
+      container.appendChild(pre);
+    });
+  };
+  if (document.readyState === 'loading') {
+    document.addEventListener('DOMContentLoaded', bootstrap, { once: true });
+  } else {
+    bootstrap();
+  }
+})();
+</script>