joelniklaus HF Staff committed on
Commit
20b148e
·
1 Parent(s): 9142722

extend math analysis

Browse files
app/src/content/analysis/qwen3_vs_smollm2_math.py CHANGED
@@ -15,15 +15,28 @@ from collections import Counter
15
  from pathlib import Path
16
 
17
  import pandas as pd
 
 
18
 
19
  logging.basicConfig(level=logging.INFO, format="%(message)s")
20
  logger = logging.getLogger(__name__)
21
 
22
  BUCKET_URI = "hf://buckets/HuggingFaceFW/finephrase-rephrased/format"
23
- CHUNK = "00000_chunk_0.jsonl.gz"
 
 
24
  CACHE_DIR = Path(__file__).parent / ".cache_math_outputs"
25
 
26
- MODELS = {"SmolLM2": "math-smollm2-1.7b-hq", "Qwen3": "math-qwen3-1.7b-hq"}
 
 
 
 
 
 
 
 
 
27
 
28
  # Headers like "Problem:" and "**Problem:**", optionally bolded
29
  PROBLEM_RE = re.compile(r"\*{0,2}\s*problem\s*\*{0,2}\s*:", re.IGNORECASE)
@@ -35,48 +48,207 @@ STEP_RE = re.compile(r"^\s*(?:\d+[\.\)]|Step \d+:|[-*])\s", re.MULTILINE)
35
  LATEX_RE = re.compile(
36
  r"\$\$[^$]+\$\$|\$[^$\n]+\$|\\frac|\\times|\\div|\\cdot|\\sqrt|\\sum|\\int|\\pi\b|\\alpha\b|\\beta\b"
37
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # Prefix lengths (in characters) at which to measure template collapse.
39
  START_LENS = list(range(2, 61, 2))
40
 
41
 
42
- def fetch_chunk(model_dir: str) -> Path:
43
- """Download chunk 0 for `model_dir` from the bucket into the local cache."""
44
- CACHE_DIR.mkdir(exist_ok=True)
45
- dst = CACHE_DIR / f"{model_dir}.jsonl.gz"
 
46
  if dst.exists():
47
  return dst
48
- src = f"{BUCKET_URI}/{model_dir}/{CHUNK}"
49
- logger.info(f"Downloading {src} -> {dst}")
50
- subprocess.run(["hf", "buckets", "cp", src, str(dst)], check=True)
 
 
 
51
  return dst
52
 
53
 
54
- def load_outputs(path: Path) -> pd.DataFrame:
55
- """Load rephrased outputs and per-row structural features."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  rows = []
57
- with gzip.open(path, "rt") as f:
58
- for line in f:
59
- d = json.loads(line)
60
- # Some SmolLM2 rows had vLLM context-length errors and lack `text`
61
- text = d["text"] if "text" in d else ""
62
- tokens = d["metadata"]["token_count"]
63
- ir = d["metadata"]["inference_results"][0]
64
- finish_reason = ir["finish_reason"] if "finish_reason" in ir else "error"
65
- has_problem = bool(PROBLEM_RE.search(text))
66
- has_solution = bool(SOLUTION_RE.search(text))
67
- row = {
68
- "text": text,
69
- "tokens": tokens,
70
- "finish_reason": finish_reason,
71
- "has_problem": has_problem,
72
- "has_solution": has_solution,
73
- "has_problem_solution": has_problem and has_solution,
74
- "has_step_by_step": len(STEP_RE.findall(text)) >= 2,
75
- "has_latex": bool(LATEX_RE.search(text)),
76
- }
77
- for n in START_LENS:
78
- row[f"start_{n}"] = text[:n]
79
- rows.append(row)
80
  return pd.DataFrame(rows)
81
 
82
 
@@ -93,10 +265,36 @@ def quality_bucket(row: pd.Series) -> str:
93
  return "Poor"
94
 
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def summarise(name: str, df: pd.DataFrame) -> dict:
97
  """Compute the headline numbers for one model."""
98
  n = len(df)
99
  buckets = df.apply(quality_bucket, axis=1).value_counts().to_dict()
 
 
 
 
 
 
 
 
 
100
  # Exclude failed inferences (empty text) from prefix-collapse to avoid
101
  # an artificial "" cluster.
102
  non_empty = df[df["text"].str.len() > 0]
@@ -128,57 +326,105 @@ def summarise(name: str, df: pd.DataFrame) -> dict:
128
  "n_errored": int((df["finish_reason"] == "error").sum()),
129
  "prefix_collapse": prefix_collapse,
130
  "buckets": {k: 100 * buckets.get(k, 0) / n for k in ("Excellent", "Good", "Partial", "Poor")},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
 
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def main() -> None:
 
135
  summaries = {}
136
- for name, model_dir in MODELS.items():
137
- path = fetch_chunk(model_dir)
138
- df = load_outputs(path)
 
 
139
  summaries[name] = summarise(name, df)
140
 
141
  logger.info("\n=== Structural quality (per 1000 outputs) ===")
142
- logger.info(f"{'Metric':<32} {'SmolLM2':>10} {'Qwen3':>10}")
143
- logger.info("-" * 54)
144
- for key, label in [
145
- ("pct_problem_solution", "Problem + Solution sections"),
146
- ("pct_step_by_step", "Numbered step-by-step"),
147
- ("pct_latex", "LaTeX math notation"),
148
- ("pct_has_solution", "Contains the word 'solution'"),
149
- ]:
150
- logger.info(
151
- f"{label:<32} {summaries['SmolLM2'][key]:>9.1f}% {summaries['Qwen3'][key]:>9.1f}%"
152
- )
153
 
154
  logger.info("\n=== Output length (tokens, output_tokenizer) ===")
155
- logger.info(f"{'Statistic':<32} {'SmolLM2':>10} {'Qwen3':>10}")
156
- logger.info("-" * 54)
157
- for key, label in [
158
- ("token_min", "min (all)"),
159
- ("token_median", "median (all)"),
160
- ("token_max", "max (all)"),
161
- ("stop_token_min", "min (finish=stop)"),
162
- ("stop_token_max", "max (finish=stop)"),
163
- ("n_truncated", "# length-truncated"),
164
- ("n_errored", "# inference errors"),
165
- ]:
166
- logger.info(f"{label:<32} {summaries['SmolLM2'][key]:>10} {summaries['Qwen3'][key]:>10}")
167
 
168
  logger.info("\n=== Template collapse (most common prefix at varying lengths) ===")
169
- logger.info(f"{'Prefix chars':<14} {'SmolLM2 (top/distinct)':>26} {'Qwen3 (top/distinct)':>26}")
170
- logger.info("-" * 70)
 
 
 
171
  for size in [10, 20, 40, 60]:
172
- s = summaries["SmolLM2"]["prefix_collapse"][size]
173
- q = summaries["Qwen3"]["prefix_collapse"][size]
174
- logger.info(
175
- f"{size:<14} "
176
- f"{s['most_common_count']:>10}/1000 ({s['distinct']:>4} unique) "
177
- f"{q['most_common_count']:>10}/1000 ({q['distinct']:>4} unique)"
178
- )
179
  logger.info("\nMost common 40-char prefix per model:")
180
- for name in ("SmolLM2", "Qwen3"):
181
- logger.info(f" {name}: {summaries[name]['prefix_collapse'][40]['example']!r}")
182
 
183
  # Emit the dense prefix-collapse curve as CSV for the d3 chart.
184
  csv_path = Path(__file__).parent.parent / "assets/data/qwen3_vs_smollm2_prefix_collapse.csv"
@@ -196,16 +442,136 @@ def main() -> None:
196
  pd.DataFrame(csv_rows).to_csv(csv_path, index=False)
197
  logger.info(f"\nSaved prefix-collapse curve to {csv_path}")
198
 
199
- logger.info("\n=== Quality buckets (% of outputs) ===")
200
- logger.info(f"{'Bucket':<12} {'SmolLM2':>10} {'Qwen3':>10}")
201
- logger.info("-" * 34)
202
- for bucket in ("Excellent", "Good", "Partial", "Poor"):
203
- logger.info(
204
- f"{bucket:<12} "
205
- f"{summaries['SmolLM2']['buckets'][bucket]:>9.1f}% "
206
- f"{summaries['Qwen3']['buckets'][bucket]:>9.1f}%"
207
  )
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  out = Path(__file__).parent / "qwen3_vs_smollm2_math_results.json"
210
  with out.open("w") as f:
211
  json.dump(summaries, f, indent=2)
 
15
  from pathlib import Path
16
 
17
  import pandas as pd
18
+ from joblib import Parallel, delayed
19
+ from tqdm.auto import tqdm
20
 
21
  logging.basicConfig(level=logging.INFO, format="%(message)s")
22
  logger = logging.getLogger(__name__)
23
 
24
  BUCKET_URI = "hf://buckets/HuggingFaceFW/finephrase-rephrased/format"
25
+ # Each chunk holds 1000 documents; 10 chunks = 10000 outputs per model.
26
+ N_CHUNKS = 10
27
+ CHUNKS = [f"00000_chunk_{i}.jsonl.gz" for i in range(N_CHUNKS)]
28
  CACHE_DIR = Path(__file__).parent / ".cache_math_outputs"
29
 
30
+ MODELS = {
31
+ "SmolLM2": "math-smollm2-1.7b-hq",
32
+ "Qwen3": "math-qwen3-1.7b-hq",
33
+ "Llama 3.2": "math-llama3.2-1b-hq",
34
+ "Gemma 3": "math-1b-hq",
35
+ "Falcon 3": "math-falcon3-1b-hq",
36
+ "Granite 3": "math-granite3-1b-hq",
37
+ }
38
+ # Default-visible models in the interactive comparison.
39
+ DEFAULT_VISIBLE = ["SmolLM2", "Qwen3", "Llama 3.2"]
40
 
41
  # Headers like "Problem:" and "**Problem:**", optionally bolded
42
  PROBLEM_RE = re.compile(r"\*{0,2}\s*problem\s*\*{0,2}\s*:", re.IGNORECASE)
 
48
  LATEX_RE = re.compile(
49
  r"\$\$[^$]+\$\$|\$[^$\n]+\$|\\frac|\\times|\\div|\\cdot|\\sqrt|\\sum|\\int|\\pi\b|\\alpha\b|\\beta\b"
50
  )
51
+
52
# Format-adherence patterns for the math audit (Section 5.2 reviewer response).
DIGIT_RE = re.compile(r"\d")
# Any arithmetic op directly between two numbers, or common LaTeX math ops.
ARITH_OP_RE = re.compile(r"\d\s*[+\-×÷*/]\s*\d|\\times|\\div|\\cdot|\\frac")
# One number: optional sign, then EITHER a comma-grouped integer with at least
# one ",ddd" group (e.g. "1,500") OR a plain digit run, then optional decimals.
# The comma alternative must require a comma group: if it could match a bare
# 1-3 digit run, the last (unanchored) number of an equation would stop after
# three digits, e.g. "1502" would be captured as "150".
_NUM = r"-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?"
# A full "N op N = N" arithmetic statement; the four capture groups are
# (left operand, operator, right operand, result).
ARITH_EQ_RE = re.compile(rf"({_NUM})\s*([+\-×÷*/])\s*({_NUM})\s*=\s*({_NUM})")
# Standalone numeric tokens for the number-provenance check.
NUM_TOKEN_RE = re.compile(r"\d+(?:[.,]\d+)*")
# Interrogative / imperative cues that mark a real math question.
QUESTION_CUE_RE = re.compile(
    r"\b(how many|how much|how long|how often|what is|what was|what are|"
    r"calculate|compute|determine|find|solve)\b",
    re.IGNORECASE,
)
# Patterns at the end of a solution that indicate a closed numeric answer.
FINAL_ANSWER_RE = re.compile(
    r"(the (final )?answer is\s*\$?\d|therefore[, ]+[^.\n]{0,40}=\s*-?\d|"
    r"=\s*-?\d+(?:\.\d+)?\s*[.\n]|answer\s*[:=]\s*-?\d|total\s*[:=]\s*-?\d)",
    re.IGNORECASE,
)
# Lexicon used for math-vocabulary density (per 100 alpha tokens).
MATH_WORDS = frozenset({
    "calculate", "calculation", "equation", "equations", "probability", "percentage",
    "percent", "ratio", "sum", "product", "average", "mean", "median",
    "divided", "multiplied", "equals", "subtract", "subtraction", "add", "addition",
    "multiply", "multiplication", "divide", "division", "formula", "variable",
    "solve", "value", "integer", "decimal", "fraction", "remainder", "quotient",
})
# English stopwords for the topic-drift Jaccard check.
STOPWORDS = frozenset((
    "the a an and or but if then so as is are was were be been being have has "
    "had do does did of to in on at by for with from up about into through "
    "during before after above below between under further once here there "
    "when where why how all any both each few more most other some such no nor "
    "not only own same so than too very can will just don should now this that "
    "these those it its their them they we you our your his her him she he"
).split())
90
+
91
  # Prefix lengths (in characters) at which to measure template collapse.
92
  START_LENS = list(range(2, 61, 2))
93
 
94
 
95
def _download_one(model_dir: str, chunk_name: str) -> Path:
    """Download a single chunk if not already cached and return its local path.

    The chunk is first copied to a sibling ``<chunk_name>.part`` file and only
    renamed to its final name once the ``hf buckets cp`` command succeeds, so an
    interrupted or failed transfer never leaves a truncated file that a later
    run would mistake for a valid cache entry.

    Args:
        model_dir: Sub-directory of the bucket (and of the cache) for one model.
        chunk_name: File name of the chunk inside that directory.

    Returns:
        Path of the cached chunk under ``CACHE_DIR / model_dir``.

    Raises:
        subprocess.CalledProcessError: If the ``hf`` command exits non-zero.
    """
    dst_dir = CACHE_DIR / model_dir
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst = dst_dir / chunk_name
    if dst.exists():
        return dst
    src = f"{BUCKET_URI}/{model_dir}/{chunk_name}"
    partial = dst.with_name(dst.name + ".part")
    try:
        subprocess.run(
            ["hf", "buckets", "cp", src, str(partial)],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as err:
        # Output is captured, so surface stderr explicitly before re-raising;
        # otherwise the reason for the failure is invisible to the user.
        partial.unlink(missing_ok=True)
        logger.error(f"Download failed for {src}: {(err.stderr or '').strip()}")
        raise
    # Same-directory rename: the final name only ever points at a complete file.
    partial.replace(dst)
    return dst
109
 
110
 
111
def fetch_chunks(model_dir: str, chunks: list[str] = CHUNKS) -> list[Path]:
    """Ensure every chunk in `chunks` is cached locally for `model_dir`.

    Chunks whose cache file is absent are fetched through `_download_one` on a
    4-worker thread pool; chunks already on disk are left untouched.

    Returns:
        The cache paths of all requested chunks, in the order of `chunks`.
    """
    model_cache = CACHE_DIR / model_dir
    missing = [chunk for chunk in chunks if not (model_cache / chunk).exists()]
    if missing:
        logger.info(f" Downloading {len(missing)} chunk(s) for {model_dir}...")
        progress = tqdm(missing, desc=f" {model_dir}", leave=False)
        jobs = (delayed(_download_one)(model_dir, chunk) for chunk in progress)
        Parallel(n_jobs=4, backend="threading")(jobs)
    return [model_cache / chunk for chunk in chunks]
121
+
122
+
123
def _verify_equation(a_s: str, op: str, b_s: str, r_s: str) -> bool:
    """Check whether the textual statement `a_s op b_s = r_s` is arithmetically true.

    Thousands commas are removed before parsing. The result is accepted when it
    lies within max(0.01, 0.01% of the expected value) of the computed value.
    Unparseable numbers, unknown operators and division by zero all yield False.
    """
    try:
        a, b, r = (float(s.replace(",", "")) for s in (a_s, b_s, r_s))
    except ValueError:
        return False

    operations = {
        "+": lambda x, y: x + y,
        "-": lambda x, y: x - y,
        "×": lambda x, y: x * y,
        "*": lambda x, y: x * y,
        "÷": lambda x, y: x / y,
        "/": lambda x, y: x / y,
    }
    compute = operations.get(op)
    if compute is None:
        return False
    try:
        expected = compute(a, b)
    except ZeroDivisionError:
        # A zero divisor can never form a valid equation.
        return False

    tolerance = max(0.01, abs(expected) * 1e-4)
    return abs(expected - r) < tolerance
144
+
145
+
146
def _content_words(text: str) -> set[str]:
    """Return the distinct lowercase ASCII-letter runs of 4+ characters in `text`
    that are not listed in STOPWORDS."""
    candidates = re.findall(r"[a-z]{4,}", text.lower())
    return set(candidates) - STOPWORDS
149
+
150
+
151
def _normalize_numbers(text: str) -> set[str]:
    """Collect the distinct numeric tokens found by NUM_TOKEN_RE in `text`,
    each with its commas removed (so "1,500" and "1500" compare equal)."""
    normalized = set()
    for token in NUM_TOKEN_RE.findall(text):
        normalized.add(token.replace(",", ""))
    return normalized
154
+
155
+
156
def audit_math_validity(text: str, input_text: str) -> dict:
    """Compute row-level math format-adherence features for one output.

    Besides basic structural flags (digits, arithmetic operators, extracted and
    verified "N op N = N" equations), five audit signals are produced:

      1. Number provenance: how many distinct numbers of the output also occur
         in the source document.
      2. Question/answer alignment: a question cue anywhere in the output plus
         a digit within its last 200 characters.
      3. Final-answer pattern: a match of FINAL_ANSWER_RE.
      4. Math vocabulary density: MATH_WORDS hits per 100 alphabetic tokens.
      5. Topic Jaccard: overlap of content-word sets of source and output.

    Args:
        text: The rephrased model output.
        input_text: The source document the output was generated from.

    Returns:
        A flat dict of feature name -> value, in a fixed key order.
    """
    # Extracted equations and how many of them check out arithmetically.
    equations = ARITH_EQ_RE.findall(text)
    correct_count = sum(_verify_equation(*eq) for eq in equations)

    # Numbers appearing in both the output and the source.
    numbers_out = _normalize_numbers(text)
    numbers_in = _normalize_numbers(input_text)
    numbers_shared = numbers_out & numbers_in

    # Jaccard similarity of the content-word sets; 0.0 when both are empty.
    words_in = _content_words(input_text)
    words_out = _content_words(text)
    words_union = words_in | words_out
    if words_union:
        jaccard = len(words_in & words_out) / len(words_union)
    else:
        jaccard = 0.0

    # Share of alphabetic tokens that belong to the math lexicon, per 100 tokens.
    alpha_tokens = re.findall(r"[a-zA-Z]+", text.lower())
    if alpha_tokens:
        math_hits = sum(token in MATH_WORDS for token in alpha_tokens)
        math_vocab_density = 100.0 * math_hits / len(alpha_tokens)
    else:
        math_vocab_density = 0.0

    has_question_cue = bool(QUESTION_CUE_RE.search(text))
    has_numeric_in_tail = bool(re.search(r"\d", text[-200:]))

    features = {}
    # Structural
    features["has_digits"] = bool(DIGIT_RE.search(text))
    features["has_arith_op"] = bool(ARITH_OP_RE.search(text))
    features["n_equations"] = len(equations)
    features["n_correct_equations"] = correct_count
    features["has_correct_equation"] = correct_count > 0
    # Audit #1: number provenance
    features["n_out_numbers"] = len(numbers_out)
    features["n_shared_numbers"] = len(numbers_shared)
    features["any_number_from_input"] = len(numbers_shared) > 0
    # Audit #2: question/answer alignment
    features["has_question_cue"] = has_question_cue
    features["has_numeric_in_tail"] = has_numeric_in_tail
    features["qa_aligned"] = has_question_cue and has_numeric_in_tail
    # Audit #3: final-answer pattern
    features["has_final_answer"] = bool(FINAL_ANSWER_RE.search(text))
    # Audit #4: math vocabulary density (per 100 alpha tokens)
    features["math_vocab_density"] = math_vocab_density
    # Audit #5: topic Jaccard with input
    features["topic_jaccard"] = jaccard
    return features
216
+
217
+
218
def _features_for_line(line: str) -> dict:
    """Parse one JSONL record and derive its structural and audit features.

    Returns a dict holding the output text, token count, finish reason, the
    Problem/Solution/step/LaTeX flags, every key from `audit_math_validity`,
    and one `start_<n>` text prefix for each length in START_LENS.
    """
    record = json.loads(line)
    metadata = record["metadata"]
    # Some SmolLM2 rows had vLLM context-length errors and lack `text`
    text = record.get("text", "")
    input_text = metadata["input"]["text"]
    tokens = metadata["token_count"]
    first_result = metadata["inference_results"][0]
    # A result without a finish reason is treated as a failed inference.
    finish_reason = first_result.get("finish_reason", "error")

    has_problem = bool(PROBLEM_RE.search(text))
    has_solution = bool(SOLUTION_RE.search(text))
    step_markers = STEP_RE.findall(text)

    row = {
        "text": text,
        "tokens": tokens,
        "finish_reason": finish_reason,
        "has_problem": has_problem,
        "has_solution": has_solution,
        "has_problem_solution": has_problem and has_solution,
        "has_step_by_step": len(step_markers) >= 2,
        "has_latex": bool(LATEX_RE.search(text)),
    }
    row.update(audit_math_validity(text, input_text))
    row.update({f"start_{n}": text[:n] for n in START_LENS})
    return row
243
+
244
+
245
def load_outputs(paths: list[Path]) -> pd.DataFrame:
    """Load rephrased outputs from one or more chunks and compute per-row features.

    Args:
        paths: Gzip-compressed JSONL files, one JSON record per line.

    Returns:
        A DataFrame with one row per record, holding the features produced by
        `_features_for_line`, in file order.
    """
    rows = []
    for path in paths:
        # Decode explicitly as UTF-8: the default text encoding depends on the
        # platform locale, and the outputs contain non-ASCII math symbols.
        with gzip.open(path, "rt", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty string.
            rows.extend(_features_for_line(line) for line in f if line.strip())
    return pd.DataFrame(rows)
253
 
254
 
 
265
  return "Poor"
266
 
267
 
268
def math_adherence_bucket(row: pd.Series) -> str:
    """Assign one output to a math-adherence bucket.

    The bucket depends only on the arithmetic evidence in the row, not on
    surface structure such as "Problem:" / "Solution:" markers. Rules are
    tried from strongest to weakest evidence and the first hit wins:

      * "Valid math": at least one verifiably correct N op N = N equation.
      * "Wrong math": equations present, none arithmetically correct.
      * "Implicit math": operators between numbers, no closed equation.
      * "Numeric but no math": digits present, no arithmetic at all.
      * "No math content": no digits at all.
    """
    rules = (
        ("has_correct_equation", bool, "Valid math"),
        ("n_equations", lambda count: count > 0, "Wrong math"),
        ("has_arith_op", bool, "Implicit math"),
        ("has_digits", bool, "Numeric but no math"),
    )
    for column, is_hit, label in rules:
        if is_hit(row[column]):
            return label
    return "No math content"
283
+
284
+
285
  def summarise(name: str, df: pd.DataFrame) -> dict:
286
  """Compute the headline numbers for one model."""
287
  n = len(df)
288
  buckets = df.apply(quality_bucket, axis=1).value_counts().to_dict()
289
+ adherence = df.apply(math_adherence_bucket, axis=1).value_counts().to_dict()
290
+ # Sample a few "No math content" outputs (the Paul Revere case) for the appendix.
291
+ no_math_examples = (
292
+ df[df["has_problem_solution"] & ~df["has_digits"]]["text"]
293
+ .head(3)
294
+ .tolist()
295
+ )
296
+ total_eqs = int(df["n_equations"].sum())
297
+ correct_eqs = int(df["n_correct_equations"].sum())
298
  # Exclude failed inferences (empty text) from prefix-collapse to avoid
299
  # an artificial "" cluster.
300
  non_empty = df[df["text"].str.len() > 0]
 
326
  "n_errored": int((df["finish_reason"] == "error").sum()),
327
  "prefix_collapse": prefix_collapse,
328
  "buckets": {k: 100 * buckets.get(k, 0) / n for k in ("Excellent", "Good", "Partial", "Poor")},
329
+ "math_adherence": {
330
+ k: 100 * adherence.get(k, 0) / n
331
+ for k in ("Valid math", "Wrong math", "Implicit math", "Numeric but no math", "No math content")
332
+ },
333
+ "pct_with_equation": 100 * (df["n_equations"] > 0).mean(),
334
+ "pct_with_correct_equation": 100 * df["has_correct_equation"].mean(),
335
+ "pct_with_arith_op": 100 * df["has_arith_op"].mean(),
336
+ "pct_no_digits": 100 * (~df["has_digits"]).mean(),
337
+ "equation_accuracy": (correct_eqs / total_eqs) if total_eqs else float("nan"),
338
+ "total_equations": total_eqs,
339
+ "correct_equations": correct_eqs,
340
+ # Tier-1 audits
341
+ "pct_any_number_from_input": 100 * df["any_number_from_input"].mean(),
342
+ "pct_qa_aligned": 100 * df["qa_aligned"].mean(),
343
+ "pct_has_final_answer": 100 * df["has_final_answer"].mean(),
344
+ "math_vocab_density_mean": float(df["math_vocab_density"].mean()),
345
+ "topic_jaccard_mean": float(df["topic_jaccard"].mean()),
346
+ "no_math_examples": no_math_examples,
347
  }
348
 
349
 
350
def _print_metric_table(summaries: dict, rows: list[tuple[str, str, str]]) -> None:
    """Log one table with a metric per line and a column per model.

    Args:
        summaries: Mapping of model name -> summary dict; its key order fixes
            the column order.
        rows: Tuples of (summary_key, display_label, format_spec), where
            format_spec is "%" for a percentage with one decimal, "i" for a
            value printed as-is, and anything else for a float with three
            decimals.
    """
    model_names = list(summaries)
    longest_name = max(len(model) for model in model_names)
    col_w = max(11, longest_name + 1)

    def _cell(value, fmt: str) -> str:
        # The percent sign takes one character of the column width.
        if fmt == "%":
            return f"{value:>{col_w - 1}.1f}%"
        if fmt == "i":
            return f"{value:>{col_w}}"
        return f"{value:>{col_w}.3f}"

    header = f"{'Metric':<42}" + "".join(f"{model:>{col_w}}" for model in model_names)
    logger.info(header)
    logger.info("-" * len(header))
    for key, label, fmt in rows:
        rendered = "".join(_cell(summaries[model][key], fmt) for model in model_names)
        logger.info(f"{label:<42}" + rendered)
372
+
373
+
374
def _migrate_old_cache() -> None:
    """Relocate legacy single-file caches into the per-model chunk layout.

    For every model in MODELS, a file named `<model_dir>.jsonl.gz` directly
    under CACHE_DIR is moved to `CACHE_DIR/<model_dir>/<first chunk name>`,
    unless that target already exists. Does nothing when CACHE_DIR is absent.
    """
    if not CACHE_DIR.exists():
        return
    first_chunk = CHUNKS[0]
    for model_dir in MODELS.values():
        legacy_file = CACHE_DIR / f"{model_dir}.jsonl.gz"
        target_file = CACHE_DIR / model_dir / first_chunk
        # Nothing to migrate, or the new layout is already populated.
        if not legacy_file.exists() or target_file.exists():
            continue
        target_file.parent.mkdir(parents=True, exist_ok=True)
        legacy_file.rename(target_file)
385
+
386
+
387
  def main() -> None:
388
+ _migrate_old_cache()
389
  summaries = {}
390
+ for name, model_dir in tqdm(MODELS.items(), desc="Models", unit="model"):
391
+ logger.info(f"\nProcessing {name} ({model_dir})...")
392
+ paths = fetch_chunks(model_dir)
393
+ df = load_outputs(paths)
394
+ logger.info(f" Loaded {len(df)} outputs.")
395
  summaries[name] = summarise(name, df)
396
 
397
  logger.info("\n=== Structural quality (per 1000 outputs) ===")
398
+ _print_metric_table(summaries, [
399
+ ("pct_problem_solution", "Problem + Solution sections", "%"),
400
+ ("pct_step_by_step", "Numbered step-by-step", "%"),
401
+ ("pct_latex", "LaTeX math notation", "%"),
402
+ ("pct_has_solution", "Contains the word 'solution'", "%"),
403
+ ])
 
 
 
 
 
404
 
405
  logger.info("\n=== Output length (tokens, output_tokenizer) ===")
406
+ _print_metric_table(summaries, [
407
+ ("token_min", "min (all)", "i"),
408
+ ("token_median", "median (all)", "i"),
409
+ ("token_max", "max (all)", "i"),
410
+ ("stop_token_min", "min (finish=stop)", "i"),
411
+ ("stop_token_max", "max (finish=stop)", "i"),
412
+ ("n_truncated", "# length-truncated", "i"),
413
+ ("n_errored", "# inference errors", "i"),
414
+ ])
 
 
 
415
 
416
  logger.info("\n=== Template collapse (most common prefix at varying lengths) ===")
417
+ names = list(summaries)
418
+ col_w = max(15, max(len(n) for n in names) + 4)
419
+ header = f"{'Prefix chars':<14}" + "".join(f"{n:>{col_w}}" for n in names)
420
+ logger.info(header)
421
+ logger.info("-" * len(header))
422
  for size in [10, 20, 40, 60]:
423
+ cells = [f"{summaries[n]['prefix_collapse'][size]['most_common_count']:>4}/1000".rjust(col_w) for n in names]
424
+ logger.info(f"{size:<14}" + "".join(cells))
 
 
 
 
 
425
  logger.info("\nMost common 40-char prefix per model:")
426
+ for n in names:
427
+ logger.info(f" {n}: {summaries[n]['prefix_collapse'][40]['example']!r}")
428
 
429
  # Emit the dense prefix-collapse curve as CSV for the d3 chart.
430
  csv_path = Path(__file__).parent.parent / "assets/data/qwen3_vs_smollm2_prefix_collapse.csv"
 
442
  pd.DataFrame(csv_rows).to_csv(csv_path, index=False)
443
  logger.info(f"\nSaved prefix-collapse curve to {csv_path}")
444
 
445
+ # Flatten bucket/adherence dicts into top-level keys so the table helper can read them.
446
+ for s in summaries.values():
447
+ for b, v in s["buckets"].items():
448
+ s[f"bucket__{b}"] = v
449
+ for b, v in s["math_adherence"].items():
450
+ s[f"adherence__{b}"] = v
451
+ s["pct_equation_accuracy"] = (
452
+ 100 * s["equation_accuracy"] if s["equation_accuracy"] == s["equation_accuracy"] else 0.0
453
  )
454
 
455
+ logger.info("\n=== Quality buckets (% of outputs) ===")
456
+ _print_metric_table(summaries, [(f"bucket__{b}", b, "%") for b in ("Excellent", "Good", "Partial", "Poor")])
457
+
458
+ logger.info("\n=== Math format-adherence audit (reviewer response) ===")
459
+ _print_metric_table(summaries, [
460
+ ("pct_no_digits", "No digits at all", "%"),
461
+ ("pct_with_arith_op", "Has arithmetic operator", "%"),
462
+ ("pct_with_equation", "Has full N op N = N equation", "%"),
463
+ ("pct_with_correct_equation", "Has ≥1 correct arithmetic equation", "%"),
464
+ ("pct_any_number_from_input", "≥1 number reused from source", "%"),
465
+ ("pct_qa_aligned", "Question word + numeric answer", "%"),
466
+ ("pct_has_final_answer", "Closed final-answer pattern", "%"),
467
+ ("pct_equation_accuracy", "Per-equation arithmetic accuracy", "%"),
468
+ ("math_vocab_density_mean", "Math-vocab density (per 100 tokens)", "f"),
469
+ ("topic_jaccard_mean", "Topic Jaccard with source (mean)", "f"),
470
+ ])
471
+
472
+ logger.info("\n=== Adherence buckets (% of outputs) ===")
473
+ _print_metric_table(summaries, [
474
+ (f"adherence__{b}", b, "%")
475
+ for b in ("Valid math", "Wrong math", "Implicit math", "Numeric but no math", "No math content")
476
+ ])
477
+
478
+ # Emit a long-format CSV of the per-model format-adherence audit for the embed.
479
+ adherence_csv = Path(__file__).parent.parent / "assets/data/math_format_adherence.csv"
480
+ # direction: "higher" = darker bar for higher value, "lower" = darker for lower,
481
+ # "neutral" = no bar shading and no best-cell highlight.
482
+ # description: shown in the embed's hover tooltip to explain how the metric is computed.
483
+ audit_rows = [
484
+ (
485
+ "Adherence buckets",
486
+ "Valid math (≥1 verifiably correct equation)",
487
+ "adherence__Valid math", "%", "higher",
488
+ "Output contains at least one arithmetic statement of the form N op N = N where the math checks out (e.g., 120 × 5 = 600). Mutually exclusive with the other adherence buckets.",
489
+ ),
490
+ (
491
+ "Adherence buckets",
492
+ "Wrong math (equations but none correct)",
493
+ "adherence__Wrong math", "%", "lower",
494
+ "Output contains at least one N op N = N statement but every such equation is arithmetically wrong (e.g., 3 × 5 = -15).",
495
+ ),
496
+ (
497
+ "Adherence buckets",
498
+ "Implicit math (operators, no closed equation)",
499
+ "adherence__Implicit math", "%", "neutral",
500
+ "Output contains arithmetic operators directly between numbers (e.g., '120 × 5') or LaTeX math commands (\\times, \\frac, \\cdot) but no fully closed N op N = N equation.",
501
+ ),
502
+ (
503
+ "Adherence buckets",
504
+ "Numeric but no math",
505
+ "adherence__Numeric but no math", "%", "lower",
506
+ "Output contains digits but no arithmetic operator at all. Numbers appear as quantities, years, or counts without any computation.",
507
+ ),
508
+ (
509
+ "Adherence buckets",
510
+ "No math content (zero digits)",
511
+ "adherence__No math content", "%", "lower",
512
+ "Output contains no digits at all. The dog-sunburn example above is one of these.",
513
+ ),
514
+ (
515
+ "Arithmetic correctness",
516
+ "Per-equation arithmetic accuracy",
517
+ "pct_equation_accuracy", "%", "higher",
518
+ "Of every extracted N op N = N equation across all 10,000 outputs, the percentage that are arithmetically correct (tolerance 0.01 or 0.01% of the expected value, to absorb tiny rounding).",
519
+ ),
520
+ (
521
+ "Problem shape",
522
+ "Question word + numeric answer",
523
+ "pct_qa_aligned", "%", "higher",
524
+ "Output contains a question cue ('how many', 'what is', 'calculate', 'find', 'compute', 'determine', etc.) AND has at least one digit in its final 200 characters.",
525
+ ),
526
+ (
527
+ "Problem shape",
528
+ "Closed final-answer pattern",
529
+ "pct_has_final_answer", "%", "higher",
530
+ "Output matches a regex for closed answer patterns like 'the answer is N', '= N.', 'therefore ... = N', 'answer: N', or 'total = N'.",
531
+ ),
532
+ (
533
+ "Problem shape",
534
+ "Math-vocab density (per 100 alpha tokens)",
535
+ "math_vocab_density_mean", "f", "higher",
536
+ "Mean count of math-vocabulary words per 100 alphabetic tokens, averaged across outputs. Vocabulary: calculate, equation, probability, percentage, ratio, sum, product, average, divide, multiply, formula, fraction, etc.",
537
+ ),
538
+ (
539
+ "Source grounding",
540
+ "≥1 number reused from source",
541
+ "pct_any_number_from_input", "%", "higher",
542
+ "Output shares at least one numeric token with the source document (thousands-commas stripped, so '1,500' matches '1500'). Proxy for whether the math is anchored in source content vs invented.",
543
+ ),
544
+ (
545
+ "Source grounding",
546
+ "Topic Jaccard with source (mean)",
547
+ "topic_jaccard_mean", "f", "higher",
548
+ "Mean Jaccard similarity between content-word sets of input and output (lowercase, ≥4 chars, stopwords removed). Higher = output stays closer to source vocabulary.",
549
+ ),
550
+ ]
551
+ csv_rows = []
552
+ for group, metric_label, key, fmt, direction, description in audit_rows:
553
+ for model_name in MODELS:
554
+ csv_rows.append({
555
+ "group": group,
556
+ "metric": metric_label,
557
+ "model": model_name,
558
+ "value": summaries[model_name][key],
559
+ "format": fmt,
560
+ "direction": direction,
561
+ "description": description,
562
+ })
563
+ pd.DataFrame(csv_rows).to_csv(adherence_csv, index=False)
564
+ logger.info(f"\nSaved format-adherence audit CSV to {adherence_csv}")
565
+
566
+ # Surface representative "no math content" outputs from any model that has them.
567
+ for model_name in MODELS:
568
+ exs = summaries[model_name]["no_math_examples"]
569
+ if exs:
570
+ logger.info(f"\nRepresentative {model_name} outputs with Problem/Solution structure but NO digits:")
571
+ for i, ex in enumerate(exs, 1):
572
+ logger.info(f"\n [{i}] {ex[:400].strip()}")
573
+ break # one model is enough for the printed examples
574
+
575
  out = Path(__file__).parent / "qwen3_vs_smollm2_math_results.json"
576
  with out.open("w") as f:
577
  json.dump(summaries, f, indent=2)
app/src/content/analysis/qwen3_vs_smollm2_math_results.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17a3285a9ea99ce5e54d541b881ffc8dd14c5879b829488c1df295b134cd6fb3
3
- size 8998
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66c976fc35e2c34273caf330dde6cf27fae80617d0dc5390c13d0015988be3fc
3
+ size 64005
app/src/content/assets/data/math_format_adherence.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef76807af9c4df9338083c60562e43ddc6eae771e496c5008e65f4d5fc140c4e
3
+ size 15803
app/src/content/assets/data/qwen3_vs_smollm2_prefix_collapse.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:671df2da7a38aa498ab6daf7814b9241d8a2badb5c141368dfb391819c62dcf7
3
- size 1038
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4603016ea1f8e19321f77dbaf691bc00a1a8cb3ba8f2e8326a23ebab0f26269
3
+ size 3651
app/src/content/chapters/4-analyses.mdx CHANGED
@@ -295,12 +295,12 @@ So output length doesn't predict quality either. But we stumbled onto something
295
 
296
  This was one of our most surprising findings. We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.[^math-analysis-script]
297
 
298
- [^math-analysis-script]: All numbers in this section come from `app/src/content/analysis/qwen3_vs_smollm2_math.py`, which scores the first 1000 documents of `format/math-qwen3-1.7b-hq` and `format/math-smollm2-1.7b-hq` from the [finephrase-rephrased bucket](https://huggingface.co/buckets/HuggingFaceFW/finephrase-rephrased/tree/format). Exact counts shift by a few percent with a different sample (different chunk, different seed), but the qualitative gap between the two models is stable.
299
 
300
  **Qwen3 produced beautiful, structured outputs:**
301
 
302
- - 99.8% had proper Problem/Solution sections
303
- - 99.1% had step-by-step formatting
304
  - 59% included LaTeX math notation
305
 
306
  Here's a typical Qwen3 output:
@@ -321,40 +321,73 @@ The disc makes 600 revolutions in 5 minutes.
321
  ```
322
  **SmolLM2 was messier:**
323
 
324
- - Only 67.5% had complete solutions
325
- - Wide variance in output length (5 to 2,900 tokens, vs Qwen3's 113 to 1,250)
326
  - Mix of formats: questions, partial answers, full solutions
327
 
328
- SmolLM2 outputs ranged from proper solutions to just questions like *"What is the difference between X and Y?"* or even 4-token fragments like *"Areas Where We Service"*.
329
 
330
- Yet models trained on SmolLM2's data **outperformed** those trained on Qwen3's data on downstream benchmarks. We suspect this is due to **template collapse**: Qwen3's outputs were *too* consistent. The chart below shows how many outputs share their first N characters, for N ranging from 4 to 60. At every prefix length, Qwen3 has dramatically more duplication than SmolLM2.
331
 
332
  <HtmlEmbed
333
  id="prefix-collapse"
334
  src="d3-prefix-collapse.html"
335
  data="qwen3_vs_smollm2_prefix_collapse.csv"
336
- desc="Template collapse curve across prefix lengths. Toggle between 'Most common prefix count' (how many of 1000 outputs share the single most common opening) and 'Distinct prefixes' (how many unique openings exist). Qwen3 collapses far more aggressively at every prefix length."
337
  />
338
 
339
- The numbers are striking: at the **first 10 characters**, Qwen3 has only **9 distinct openings** across 1000 outputs (783 of them share the most common one), while SmolLM2 has **270 distinct openings**. At 20 characters, Qwen3 still has 129 outputs sharing one opening (`'**Mathematical Word Problem:** A'`) versus only 39 for SmolLM2. The two models' top 40-character prefixes tell the story directly: Qwen3 produces `'**Mathematical Word Problem:**\n\nA school'` over and over, while SmolLM2's most common opener is the looser `'Question: What is the difference between'`.
340
 
341
  SmolLM2's quality distribution was actually reasonable:
342
 
343
  | Quality | Criteria | Share |
344
  | --- | --- | --- |
345
- | Excellent | Has "solution" + structural steps + 80+ tokens | 44.5% |
346
- | Good | Has "solution" + 50+ tokens | 21.2% |
347
- | Partial | 30+ tokens but missing structure | 23.4% |
348
- | Poor | {'<'}30 tokens | 10.9% |
349
 
350
- The lesson: for pretraining data, diversity beats consistency. A model that doesn't follow instructions perfectly can actually produce better training data than one that does. This also helps explain why SmolLM2 dominates the model family comparison: it produces more varied outputs, which may matter more than precise instruction following.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
  <Note title="Summary: Analyses" variant="info">
353
  **Cost**: Small models with simple prompts dominate the Pareto frontier. Invest in prompt design, not model size.<br/>
354
  **Quality scores**: Neither edu-score nor DCLM-score reliably predicts downstream performance for synthetic data. There is no shortcut to training and evaluating.<br/>
355
  **Proxy model size**: A 2.9B student reveals three tiers (270M {'<'} 1B {'<'} 4B+) that the 1.7B student compressed. Generator gains above 1B are real but smaller than student-side gains. Student scale is the bigger lever.<br/>
356
  **Verbosity**: Output length has no meaningful relationship with performance. What matters is content, not compression ratio.<br/>
357
- **Diversity**: Template collapse hurts more than noisy outputs. A messier model that produces varied text can outperform a polished one that repeats the same template.
 
358
  </Note>
359
 
360
  With the experiments and analyses behind us, let's talk about the infrastructure that made all of this possible.
 
295
 
296
  This was one of our most surprising findings. We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.[^math-analysis-script]
297
 
298
+ [^math-analysis-script]: All numbers in this section come from `app/src/content/analysis/qwen3_vs_smollm2_math.py`, which scores the first 10000 documents of each `format/math-{model}-hq` dataset from the [finephrase-rephrased bucket](https://huggingface.co/buckets/HuggingFaceFW/finephrase-rephrased/tree/format). All six generators are small instruction-tuned models of comparable size (1B to 1.7B parameters): SmolLM2 (1.7B), Qwen3 (1.7B), Llama 3.2 (1B), Gemma 3 (1B), Falcon 3 (1B), and Granite 3 (1B). Exact counts shift by a fraction of a percent across different chunks; the qualitative gaps between the six are stable.
299
 
300
  **Qwen3 produced beautiful, structured outputs:**
301
 
302
+ - 99.7% had proper Problem/Solution sections
303
+ - 98.9% had step-by-step formatting
304
  - 59% included LaTeX math notation
305
 
306
  Here's a typical Qwen3 output:
 
321
  ```
322
  **SmolLM2 was messier:**
323
 
324
+ - Only 68.8% had complete solutions
325
+ - Wide variance in output length (1 to 3,540 tokens, vs Qwen3's 40 to 2,180)
326
  - Mix of formats: questions, partial answers, full solutions
327
 
328
+ SmolLM2 outputs ranged from proper solutions to just questions like *"What is the relationship between X and Y?"* or even tiny fragments like *"Areas Where We Service"*.
329
 
330
+ Yet models trained on SmolLM2's data **outperformed** those trained on Qwen3's data on downstream benchmarks. We suspect this is due to **template collapse**: Qwen3's outputs were *too* consistent. The chart below shows how many of 10000 outputs share their first N characters, for N ranging from 2 to 60. At every prefix length, Qwen3 has dramatically more duplication than SmolLM2.
331
 
332
  <HtmlEmbed
333
  id="prefix-collapse"
334
  src="d3-prefix-collapse.html"
335
  data="qwen3_vs_smollm2_prefix_collapse.csv"
336
+ desc="Template collapse curve across prefix lengths. Toggle between 'Most common prefix count' (how many of 10000 outputs share the single most common opening) and 'Distinct prefixes' (how many unique openings exist). Qwen3 collapses far more aggressively at every prefix length."
337
  />
338
 
339
+ The numbers are striking: at the **first 10 characters**, Qwen3 has only **40 distinct openings** across 10,000 outputs (7,619 of them share the most common one, `'**Problem:'`), while SmolLM2 has **1,897 distinct openings**. At 20 characters, Qwen3 still has 1,209 outputs sharing one opening (`'**Mathematical Word '`) versus only 443 for SmolLM2. The two models' top 40-character prefixes tell the story directly: Qwen3 produces `'**Mathematical Word Problem:**\n\nA school'` 87 times, while SmolLM2's most common opener (`'Question: What is the relationship betwe'`) appears only 35 times.
340
 
341
  SmolLM2's quality distribution was actually reasonable:
342
 
343
  | Quality | Criteria | Share |
344
  | --- | --- | --- |
345
+ | Excellent | Has "solution" + structural steps + 80+ tokens | 44.6% |
346
+ | Good | Has "solution" + 50+ tokens | 22.7% |
347
+ | Partial | 30+ tokens but missing structure | 24.1% |
348
+ | Poor | {'<'}30 tokens | 8.6% |
349
 
350
+ #### Does any of this actually contain math?
351
+
352
+ The structural and length metrics above tell us *how the outputs look*, not whether they contain valid math. To audit format adherence directly, we run a parser on each output that extracts every `N op N = N` arithmetic statement and checks it for correctness, then bucket each output by what it actually contains (a verifiably correct equation, a wrong equation, a bare arithmetic operator with no closed equation, just numbers, or no digits at all). On top of that we compute five cheap heuristics: whether any number in the output also appears in the source document (number provenance), whether the problem poses a question word and the solution ends with a numeric answer (Q/A alignment), whether the solution contains a closed final-answer pattern, the math-vocabulary density, and the topic Jaccard overlap with the source. We ran the audit on the first 10000 outputs from each of the six generators. Toggle which models are visible:
353
+
354
+ <Wide>
355
+ <HtmlEmbed
356
+ id="math-adherence-audit"
357
+ src="math-adherence-audit.html"
358
+ data="math_format_adherence.csv"
359
+ desc="Per-model format-adherence audit, 10000 outputs per model. Bar-shading and the bold cell mark the best value in each row, with higher- or lower-is-better direction adjusted per metric. Click pills to add or remove models; SmolLM2, Qwen3, and Llama 3.2 are shown by default."
360
+ />
361
+ </Wide>
362
+
363
+ Three findings deserve emphasis. First, **nearly a quarter of SmolLM2's "math" outputs contain no numeric content whatsoever** (23.0%); these are often pseudo-problems on non-mathematical topics. A representative example:
364
+
365
+ ```
366
+ Problem: How can dog owners effectively prevent sunburn in their dogs
367
+ and ensure their dogs' skin health remains intact?
368
+
369
+ Solution: Dogs can be prevented sunburn using sunscreen, protective
370
+ clothing, shade, and water. Owners should identify their dog's
371
+ sun-vulnerable areas and apply sunscreen accordingly...
372
+ ```
373
+
374
+ The same problem appears at a much lower rate in every other model: 7.7% for Gemma 3, 2.1% for Falcon 3, 1.9% for Granite 3, 0.7% for Llama 3.2, and 0.1% for Qwen3.
375
+
376
+ Second, structural perfection is *not* matched by mathematical correctness in any model. Per-equation arithmetic accuracy ranges from **Qwen3's 78.9%** (5072/6432 equations correct) down to **SmolLM2's 57.3%** (1385/2418), with the four others spread in between (Llama 72%, Falcon 71%, Gemma 68%, Granite 62%). No model is a reliable math tutor at the equation level, even when the surface formatting looks immaculate.
377
+
378
+ Third, and most surprising: **the model whose outputs stay closest to the source document is Gemma 3**, not the most formatted or the most math-dense one. Topic-Jaccard overlap ranks Gemma (0.187) > Granite (0.126) > SmolLM2 (0.120) > Falcon (0.114) > Llama (0.094) > Qwen3 (0.093). Llama 3.2 has the highest math-vocab density (2.80 math-words per 100 alpha tokens, vs SmolLM2's 1.25), and Qwen3 dominates Q/A formatting (94.4% vs SmolLM2's 43.0%) and structural completion (99.7% Problem/Solution sections). Yet these two most "polished" generators sit at the bottom of the source-grounding ranking, and both trail Gemma 3 on the actual downstream benchmark.
379
+
380
+ Put all of this together and the headline result becomes even more striking. **Qwen3 outputs are nearly 4x more likely to contain valid math than SmolLM2's (30.2% vs 8.1%), yet models pretrained on SmolLM2's outputs still perform better downstream.** Whatever pretraining is rewarding here, it isn't the correctness of individual word problems. The model-family comparison is picking up format and topic diversity in the rephrased text more than the validity of the math itself, consistent with the WRAP [@wrap] finding that paraphrase diversity drives downstream gains.
381
+
382
+ The lesson: for pretraining data, diversity beats consistency. A model that doesn't follow instructions perfectly, and even produces topically off-target outputs, can still yield better training data than one that produces polished but formulaic math. This also helps explain why SmolLM2 dominates the model family comparison: it produces more varied outputs, which may matter more than precise instruction following.
383
 
384
  <Note title="Summary: Analyses" variant="info">
385
  **Cost**: Small models with simple prompts dominate the Pareto frontier. Invest in prompt design, not model size.<br/>
386
  **Quality scores**: Neither edu-score nor DCLM-score reliably predicts downstream performance for synthetic data. There is no shortcut to training and evaluating.<br/>
387
  **Proxy model size**: A 2.9B student reveals three tiers (270M {'<'} 1B {'<'} 4B+) that the 1.7B student compressed. Generator gains above 1B are real but smaller than student-side gains. Student scale is the bigger lever.<br/>
388
  **Verbosity**: Output length has no meaningful relationship with performance. What matters is content, not compression ratio.<br/>
389
+ **Diversity**: Template collapse hurts more than noisy outputs. A messier model that produces varied text can outperform a polished one that repeats the same template.<br/>
390
+ **Format adherence**: Per-equation arithmetic accuracy ranges from Qwen3's 78.9% down to SmolLM2's 57.3% across six generators, and 23.0% of SmolLM2 outputs have no numeric content at all. Yet SmolLM2 still wins downstream, which suggests the pretraining signal is format and topic variety rather than math validity.
391
  </Note>
392
 
393
  With the experiments and analyses behind us, let's talk about the infrastructure that made all of this possible.
app/src/content/embeds/d3-prefix-collapse.html CHANGED
@@ -2,7 +2,7 @@
2
  <style>
3
  .d3-prefix-collapse { position: relative; }
4
  .d3-prefix-collapse .controls {
5
- display: flex; gap: 16px; align-items: center; justify-content: flex-end;
6
  flex-wrap: wrap; margin: 0 0 8px 0;
7
  }
8
  .d3-prefix-collapse .controls .control-group {
@@ -16,6 +16,25 @@
16
  border: 1px solid var(--border-color); border-radius: 8px;
17
  background: var(--surface-bg); color: var(--text-color); cursor: pointer;
18
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  .d3-prefix-collapse .legend {
20
  display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
21
  margin: 8px 0 0 0;
@@ -89,25 +108,39 @@
89
 
90
  const METRICS = {
91
  most_common_count: {
92
- label: 'Most common prefix (count / 1000)',
93
  y: 'most_common_count',
94
  yLabel: 'Outputs sharing the most-common prefix',
95
  higherIs: 'more collapse',
96
  },
97
  distinct: {
98
- label: 'Distinct prefixes (out of 1000)',
99
  y: 'distinct',
100
- yLabel: 'Distinct prefixes (out of 1000 outputs)',
101
  higherIs: 'more diversity',
102
  },
103
  };
104
- const MODELS = ['SmolLM2', 'Qwen3'];
 
 
 
105
 
106
- // Controls
107
  const controls = document.createElement('div');
108
  controls.className = 'controls';
109
- const controlGroup = document.createElement('div');
110
- controlGroup.className = 'control-group';
 
 
 
 
 
 
 
 
 
 
 
111
  const labelEl = document.createElement('label');
112
  const selectId = `metric-select-${Math.random().toString(36).slice(2, 8)}`;
113
  labelEl.htmlFor = selectId;
@@ -119,9 +152,10 @@
119
  opt.value = key; opt.textContent = m.label;
120
  select.appendChild(opt);
121
  });
122
- controlGroup.appendChild(labelEl);
123
- controlGroup.appendChild(select);
124
- controls.appendChild(controlGroup);
 
125
  container.appendChild(controls);
126
 
127
  // Tooltip
@@ -145,39 +179,35 @@
145
  const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
146
  const gRoot = svg.append('g');
147
 
148
- // Legend
149
- const legend = document.createElement('div');
150
- legend.className = 'legend';
151
- const legendTitle = document.createElement('div');
152
- legendTitle.className = 'legend-title';
153
- legendTitle.textContent = 'Legend';
154
- legend.appendChild(legendTitle);
155
- const legendItems = document.createElement('div');
156
- legendItems.className = 'items';
157
- legend.appendChild(legendItems);
158
- container.appendChild(legend);
159
 
160
  const getColors = () => {
161
- if (window.ColorPalettes) {
162
- const cat = window.ColorPalettes.getColors('categorical', 5);
163
- return { SmolLM2: cat[2], Qwen3: cat[0] };
164
- }
165
- return { SmolLM2: '#3fb950', Qwen3: '#f85149' };
 
 
166
  };
167
 
168
- function buildLegend(colors) {
169
- legendItems.innerHTML = '';
170
  MODELS.forEach(name => {
171
- const item = document.createElement('span');
172
- item.className = 'item';
173
- const sw = document.createElement('span');
174
- sw.className = 'swatch';
175
- sw.style.background = colors[name];
176
- const txt = document.createElement('span');
177
- txt.textContent = name;
178
- item.appendChild(sw);
179
- item.appendChild(txt);
180
- legendItems.appendChild(item);
 
 
 
181
  });
182
  }
183
 
@@ -202,7 +232,8 @@
202
  function render() {
203
  if (!chartData) return;
204
  const colors = getColors();
205
- buildLegend(colors);
 
206
 
207
  const width = container.clientWidth || 800;
208
  const height = Math.max(280, Math.round(width / 2.6));
@@ -216,10 +247,12 @@
216
  const yKey = metric.y;
217
 
218
  const xExtent = d3.extent(chartData, d => d.prefix_chars);
219
- const yMax = d3.max(chartData, d => d[yKey]);
 
 
220
 
221
  const x = d3.scaleLinear().domain(xExtent).range([0, iw]).nice();
222
- const y = d3.scaleLinear().domain([0, Math.max(yMax, 1000)]).range([ih, 0]).nice();
223
 
224
  gRoot.selectAll('*').remove();
225
 
@@ -245,12 +278,12 @@
245
  .attr('x', -ih / 2).attr('y', -50)
246
  .attr('text-anchor', 'middle').text(metric.yLabel);
247
 
248
- // Lines + points per model
249
  const line = d3.line()
250
  .x(d => x(d.prefix_chars))
251
  .y(d => y(d[yKey]));
252
 
253
- MODELS.forEach(name => {
254
  const series = chartData.filter(d => d.model === name).sort((a, b) => a.prefix_chars - b.prefix_chars);
255
  gRoot.append('path')
256
  .datum(series)
@@ -268,7 +301,7 @@
268
  .attr('stroke-width', 1);
269
  });
270
 
271
- // Hover overlay: vertical bisector that highlights both series at the
272
  // nearest prefix length.
273
  const allPrefix = Array.from(new Set(chartData.map(d => d.prefix_chars))).sort((a, b) => a - b);
274
  const overlay = gRoot.append('rect')
@@ -285,19 +318,15 @@
285
  const xv = x.invert(mx);
286
  const nearest = allPrefix.reduce((a, b) => Math.abs(b - xv) < Math.abs(a - xv) ? b : a);
287
  hoverLine.attr('x1', x(nearest)).attr('x2', x(nearest)).style('opacity', 1);
288
- const sm = chartData.find(d => d.model === 'SmolLM2' && d.prefix_chars === nearest);
289
- const qw = chartData.find(d => d.model === 'Qwen3' && d.prefix_chars === nearest);
290
- const html = `
291
- <div><strong>Prefix: ${nearest} chars</strong></div>
292
- <div class="row" style="margin-top:6px">
293
- <span class="name"><span class="swatch" style="background:${colors.SmolLM2}"></span>SmolLM2</span>
294
- <span><strong>${sm[yKey]}</strong></span>
295
- </div>
296
- <div class="row">
297
- <span class="name"><span class="swatch" style="background:${colors.Qwen3}"></span>Qwen3</span>
298
- <span><strong>${qw[yKey]}</strong></span>
299
  </div>`;
300
- showTip(html, event);
 
301
  })
302
  .on('mouseleave', () => { hoverLine.style('opacity', 0); hideTip(); });
303
  }
@@ -311,6 +340,11 @@
311
  most_common_count: +d.most_common_count,
312
  distinct: +d.distinct,
313
  }));
 
 
 
 
 
314
  render();
315
  }).catch(err => {
316
  const pre = document.createElement('pre');
 
2
  <style>
3
  .d3-prefix-collapse { position: relative; }
4
  .d3-prefix-collapse .controls {
5
+ display: flex; gap: 24px; align-items: flex-start; justify-content: space-between;
6
  flex-wrap: wrap; margin: 0 0 8px 0;
7
  }
8
  .d3-prefix-collapse .controls .control-group {
 
16
  border: 1px solid var(--border-color); border-radius: 8px;
17
  background: var(--surface-bg); color: var(--text-color); cursor: pointer;
18
  }
19
+ .d3-prefix-collapse .pills {
20
+ display: flex; flex-wrap: wrap; gap: 6px;
21
+ }
22
+ .d3-prefix-collapse .pill {
23
+ font-size: 12px; padding: 6px 12px; border-radius: 999px;
24
+ border: 1px solid var(--border-color); background: var(--surface-bg);
25
+ color: var(--muted-color); cursor: pointer; user-select: none;
26
+ transition: background .12s ease, color .12s ease, border-color .12s ease;
27
+ display: inline-flex; align-items: center; gap: 6px;
28
+ }
29
+ .d3-prefix-collapse .pill:hover { border-color: var(--text-color); }
30
+ .d3-prefix-collapse .pill.active {
31
+ color: var(--surface-bg); background: var(--text-color);
32
+ border-color: var(--text-color); font-weight: 600;
33
+ }
34
+ .d3-prefix-collapse .pill .dot {
35
+ width: 8px; height: 8px; border-radius: 50%;
36
+ background: var(--pill-color, currentColor);
37
+ }
38
  .d3-prefix-collapse .legend {
39
  display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
40
  margin: 8px 0 0 0;
 
108
 
109
  const METRICS = {
110
  most_common_count: {
111
+ label: 'Most common prefix (count)',
112
  y: 'most_common_count',
113
  yLabel: 'Outputs sharing the most-common prefix',
114
  higherIs: 'more collapse',
115
  },
116
  distinct: {
117
+ label: 'Distinct prefixes (count)',
118
  y: 'distinct',
119
+ yLabel: 'Distinct prefixes (out of all outputs)',
120
  higherIs: 'more diversity',
121
  },
122
  };
123
+ const DEFAULT_VISIBLE = new Set(['SmolLM2', 'Qwen3']);
124
+ // Filled once the data loads (preserves CSV insertion order).
125
+ let MODELS = [];
126
+ const visible = new Set();
127
 
128
+ // Controls: model toggle pills on the left, metric selector on the right.
129
  const controls = document.createElement('div');
130
  controls.className = 'controls';
131
+
132
+ const modelGroup = document.createElement('div');
133
+ modelGroup.className = 'control-group';
134
+ const modelLabel = document.createElement('label');
135
+ modelLabel.textContent = 'Models (click to toggle)';
136
+ modelGroup.appendChild(modelLabel);
137
+ const pillRow = document.createElement('div');
138
+ pillRow.className = 'pills';
139
+ modelGroup.appendChild(pillRow);
140
+ controls.appendChild(modelGroup);
141
+
142
+ const metricGroup = document.createElement('div');
143
+ metricGroup.className = 'control-group';
144
  const labelEl = document.createElement('label');
145
  const selectId = `metric-select-${Math.random().toString(36).slice(2, 8)}`;
146
  labelEl.htmlFor = selectId;
 
152
  opt.value = key; opt.textContent = m.label;
153
  select.appendChild(opt);
154
  });
155
+ metricGroup.appendChild(labelEl);
156
+ metricGroup.appendChild(select);
157
+ controls.appendChild(metricGroup);
158
+
159
  container.appendChild(controls);
160
 
161
  // Tooltip
 
179
  const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
180
  const gRoot = svg.append('g');
181
 
182
+ // We keep the model toggles in `pillRow`; no separate legend block needed
183
+ // because the pills themselves serve as the legend.
 
 
 
 
 
 
 
 
 
184
 
185
  const getColors = () => {
186
+ // Stable categorical assignment by MODELS insertion order.
187
+ const cat = window.ColorPalettes
188
+ ? window.ColorPalettes.getColors('categorical', Math.max(6, MODELS.length))
189
+ : ['#3fb950', '#f85149', '#58a6ff', '#f0883e', '#bc8cff', '#f7c843'];
190
+ const out = {};
191
+ MODELS.forEach((m, i) => { out[m] = cat[i % cat.length]; });
192
+ return out;
193
  };
194
 
195
+ function buildPills(colors) {
196
+ pillRow.innerHTML = '';
197
  MODELS.forEach(name => {
198
+ const pill = document.createElement('span');
199
+ pill.className = 'pill' + (visible.has(name) ? ' active' : '');
200
+ pill.style.setProperty('--pill-color', colors[name]);
201
+ pill.innerHTML = `<span class="dot"></span>${name}`;
202
+ pill.addEventListener('click', () => {
203
+ if (visible.has(name)) {
204
+ if (visible.size > 1) visible.delete(name);
205
+ } else {
206
+ visible.add(name);
207
+ }
208
+ render();
209
+ });
210
+ pillRow.appendChild(pill);
211
  });
212
  }
213
 
 
232
  function render() {
233
  if (!chartData) return;
234
  const colors = getColors();
235
+ buildPills(colors);
236
+ const visibleModels = MODELS.filter(m => visible.has(m));
237
 
238
  const width = container.clientWidth || 800;
239
  const height = Math.max(280, Math.round(width / 2.6));
 
247
  const yKey = metric.y;
248
 
249
  const xExtent = d3.extent(chartData, d => d.prefix_chars);
250
+ // Scale y to the max among visible models so the chart adapts to selection.
251
+ const visibleData = chartData.filter(d => visible.has(d.model));
252
+ const yMax = d3.max(visibleData, d => d[yKey]) || 1;
253
 
254
  const x = d3.scaleLinear().domain(xExtent).range([0, iw]).nice();
255
+ const y = d3.scaleLinear().domain([0, yMax * 1.05]).range([ih, 0]).nice();
256
 
257
  gRoot.selectAll('*').remove();
258
 
 
278
  .attr('x', -ih / 2).attr('y', -50)
279
  .attr('text-anchor', 'middle').text(metric.yLabel);
280
 
281
+ // Lines + points per visible model.
282
  const line = d3.line()
283
  .x(d => x(d.prefix_chars))
284
  .y(d => y(d[yKey]));
285
 
286
+ visibleModels.forEach(name => {
287
  const series = chartData.filter(d => d.model === name).sort((a, b) => a.prefix_chars - b.prefix_chars);
288
  gRoot.append('path')
289
  .datum(series)
 
301
  .attr('stroke-width', 1);
302
  });
303
 
304
+ // Hover overlay: vertical bisector that lists every visible model at the
305
  // nearest prefix length.
306
  const allPrefix = Array.from(new Set(chartData.map(d => d.prefix_chars))).sort((a, b) => a - b);
307
  const overlay = gRoot.append('rect')
 
318
  const xv = x.invert(mx);
319
  const nearest = allPrefix.reduce((a, b) => Math.abs(b - xv) < Math.abs(a - xv) ? b : a);
320
  hoverLine.attr('x1', x(nearest)).attr('x2', x(nearest)).style('opacity', 1);
321
+ const N = d3.max(chartData, d => d.distinct);
322
+ const rows = visibleModels.map(m => {
323
+ const d = chartData.find(r => r.model === m && r.prefix_chars === nearest);
324
+ return `<div class="row">
325
+ <span class="name"><span class="swatch" style="background:${colors[m]}"></span>${m}</span>
326
+ <span><strong>${d[yKey].toLocaleString()}</strong> / ${N.toLocaleString()}</span>
 
 
 
 
 
327
  </div>`;
328
+ }).join('');
329
+ showTip(`<div><strong>Prefix: ${nearest} chars</strong></div>${rows}`, event);
330
  })
331
  .on('mouseleave', () => { hoverLine.style('opacity', 0); hideTip(); });
332
  }
 
340
  most_common_count: +d.most_common_count,
341
  distinct: +d.distinct,
342
  }));
343
+ // Preserve CSV insertion order for the model list.
344
+ MODELS = [];
345
+ for (const r of chartData) if (!MODELS.includes(r.model)) MODELS.push(r.model);
346
+ const defaults = MODELS.filter(m => DEFAULT_VISIBLE.has(m));
347
+ (defaults.length ? defaults : MODELS.slice(0, 3)).forEach(m => visible.add(m));
348
  render();
349
  }).catch(err => {
350
  const pre = document.createElement('pre');
app/src/content/embeds/math-adherence-audit.html ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="math-adherence-audit"></div>
2
+ <style>
3
+ .math-adherence-audit { position: relative; }
4
+ .math-adherence-audit .controls {
5
+ display: flex; gap: 16px; align-items: center; flex-wrap: wrap; margin: 0 0 12px 0;
6
+ }
7
+ .math-adherence-audit .controls .control-group {
8
+ display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
9
+ }
10
+ .math-adherence-audit .controls .label {
11
+ font-size: 12px; font-weight: 700; color: var(--text-color);
12
+ }
13
+ .math-adherence-audit .pills {
14
+ display: flex; flex-wrap: wrap; gap: 6px;
15
+ }
16
+ .math-adherence-audit .pill {
17
+ font-size: 12px; padding: 6px 12px; border-radius: 999px;
18
+ border: 1px solid var(--border-color); background: var(--surface-bg);
19
+ color: var(--muted-color); cursor: pointer; user-select: none;
20
+ transition: background .12s ease, color .12s ease, border-color .12s ease;
21
+ display: inline-flex; align-items: center; gap: 6px;
22
+ }
23
+ .math-adherence-audit .pill:hover {
24
+ border-color: var(--text-color);
25
+ }
26
+ .math-adherence-audit .pill.active {
27
+ color: var(--surface-bg);
28
+ background: var(--text-color);
29
+ border-color: var(--text-color);
30
+ font-weight: 600;
31
+ }
32
+ .math-adherence-audit .pill .dot {
33
+ width: 8px; height: 8px; border-radius: 50%;
34
+ background: var(--pill-color, currentColor);
35
+ }
36
+ .math-adherence-audit .table-scroll {
37
+ overflow-x: auto; -webkit-overflow-scrolling: touch;
38
+ }
39
+ .math-adherence-audit table {
40
+ width: 100%; border-collapse: collapse; font-size: 13px;
41
+ min-width: 100%;
42
+ }
43
+ .math-adherence-audit th, .math-adherence-audit td {
44
+ padding: 8px 10px; text-align: right; color: var(--text-color);
45
+ border-bottom: 1px solid var(--border-color);
46
+ }
47
+ .math-adherence-audit th:first-child, .math-adherence-audit td:first-child {
48
+ text-align: left; font-weight: 500;
49
+ }
50
+ .math-adherence-audit th {
51
+ font-size: 12px; font-weight: 700; color: var(--text-color);
52
+ border-bottom: 2px solid var(--border-color);
53
+ }
54
+ .math-adherence-audit tr.group-header td {
55
+ background: transparent;
56
+ font-size: 11px; font-weight: 700; text-transform: uppercase;
57
+ letter-spacing: 0.04em; color: var(--muted-color);
58
+ padding-top: 14px; padding-bottom: 4px;
59
+ border-bottom: none;
60
+ }
61
+ .math-adherence-audit td.value {
62
+ font-variant-numeric: tabular-nums;
63
+ position: relative;
64
+ overflow: hidden;
65
+ }
66
+ .math-adherence-audit td.value .bar {
67
+ position: absolute; left: 0; top: 0; bottom: 0;
68
+ background: var(--primary-color); opacity: 0.12;
69
+ pointer-events: none; z-index: 0;
70
+ }
71
+ .math-adherence-audit td.value .v {
72
+ position: relative; z-index: 1;
73
+ }
74
+ .math-adherence-audit td.value.is-max .v { font-weight: 700; }
75
+ .math-adherence-audit td .metric-label {
76
+ cursor: help;
77
+ border-bottom: 1px dotted var(--muted-color);
78
+ }
79
+ .math-adherence-audit .info-tip {
80
+ position: absolute; top: 0; left: 0;
81
+ transform: translate(-9999px, -9999px);
82
+ pointer-events: none; padding: 10px 12px; border-radius: 8px;
83
+ font-size: 12px; line-height: 1.45;
84
+ border: 1px solid var(--border-color);
85
+ background: var(--surface-bg); color: var(--text-color);
86
+ box-shadow: 0 4px 24px rgba(0,0,0,.18);
87
+ opacity: 0; transition: opacity .12s ease;
88
+ max-width: 340px;
89
+ z-index: 20;
90
+ }
91
+ </style>
92
+ <script>
93
+ (() => {
94
+ const bootstrap = () => {
95
+ const scriptEl = document.currentScript;
96
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
97
+ if (!(container && container.classList && container.classList.contains('math-adherence-audit'))) {
98
+ const cs = Array.from(document.querySelectorAll('.math-adherence-audit'))
99
+ .filter(el => !(el.dataset && el.dataset.mounted === 'true'));
100
+ container = cs[cs.length - 1] || null;
101
+ }
102
+ if (!container) return;
103
+ if (container.dataset) {
104
+ if (container.dataset.mounted === 'true') return;
105
+ container.dataset.mounted = 'true';
106
+ }
107
+
108
+ const DEFAULT_VISIBLE = new Set(['SmolLM2', 'Qwen3', 'Llama 3.2']);
109
+
110
// Resolve with the text of the adherence CSV. Each known deployment path is
// probed in order and the first response reporting an OK status wins.
// Rejects with 'CSV not found' once every candidate has failed.
const fetchCSV = async () => {
  const candidates = [
    '/data/math_format_adherence.csv',
    './assets/data/math_format_adherence.csv',
    '../assets/data/math_format_adherence.csv',
  ];
  for (const url of candidates) {
    try {
      const response = await fetch(url, { cache: 'no-cache' });
      if (!response.ok) continue;
      return await response.text();
    } catch (_) {
      // Request or body read failed; fall through to the next candidate.
    }
  }
  throw new Error('CSV not found');
};
121
+
122
// Split one CSV record into its fields, honouring double-quoted fields.
// pandas auto-quotes any field that contains a comma, e.g.
// "Implicit math (operators, no closed equation)", and a doubled quote ("")
// inside a quoted field stands for one literal quote character.
const parseCSVLine = (line) => {
  const fields = [];
  let field = '';
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const c = line[pos];
    if (quoted && c === '"') {
      if (line[pos + 1] === '"') {
        // Escaped quote: emit one quote and skip both characters.
        field += '"';
        pos += 2;
        continue;
      }
      quoted = false;
    } else if (quoted) {
      field += c;
    } else if (c === '"') {
      quoted = true;
    } else if (c === ',') {
      fields.push(field);
      field = '';
    } else {
      field += c;
    }
    pos += 1;
  }
  // The final field has no trailing comma, so flush it explicitly.
  fields.push(field);
  return fields;
};
143
// Parse CSV text into an array of row objects keyed by the header row.
//
// The first non-blank line supplies the column names; every later non-blank
// line becomes one object mapping column name -> cell string. A row with
// fewer cells than the header leaves the missing columns `undefined`.
// Returns [] when the text contains no data rows.
const parseCSV = (text) => {
  // Accept both LF and CRLF line endings. Splitting on '\n' alone would
  // leave a stray '\r' on the last cell of every row, including the last
  // header name, so lookups by that column name would silently miss.
  // Blank lines are dropped so they do not turn into empty rows.
  const lines = text.trim().split(/\r?\n/).filter(l => l.trim() !== '');
  if (lines.length === 0) return [];
  const cols = parseCSVLine(lines.shift());
  return lines.map(l => {
    const cells = parseCSVLine(l);
    const o = {};
    cols.forEach((c, i) => { o[c] = cells[i]; });
    return o;
  });
};
153
+
154
// Format a metric value for display: percentages get one decimal and a '%'
// suffix, everything else is shown with three decimals.
const fmtValue = (v, fmt) => {
  const num = +v;
  if (fmt === '%') {
    return `${num.toFixed(1)}%`;
  }
  return num.toFixed(3);
};
155
+
156
+ const render = (rows) => {
157
// Index the CSV rows in a single pass while preserving the order in which
// metrics and models first appear:
//   metrics    - metric names in first-seen order
//   metricMeta - metric name -> { group, format, direction, description }
//   models     - model names in first-seen order
//   grouped    - metric name -> { model name -> numeric value }
const metrics = [];
const metricMeta = {};
const models = [];
const grouped = {};
for (const row of rows) {
  const { metric, model } = row;
  if (!metricMeta[metric]) {
    // The first row seen for a metric defines its display metadata.
    metricMeta[metric] = {
      group: row.group,
      format: row.format,
      direction: row.direction || 'higher',
      description: row.description || '',
    };
    metrics.push(metric);
  }
  if (!models.includes(model)) {
    models.push(model);
  }
  if (!grouped[metric]) {
    grouped[metric] = {};
  }
  grouped[metric][model] = +row.value;
}

// Start with the preferred default models; when none of them is present in
// the data, fall back to the first three models from the CSV.
const defaultModels = models.filter(m => DEFAULT_VISIBLE.has(m));
const state = {
  visible: new Set(defaultModels.length > 0 ? defaultModels : models.slice(0, 3)),
};
183
+
184
+ container.innerHTML = '';
185
+
186
+ // Tooltip for metric descriptions
187
+ const tip = document.createElement('div');
188
+ tip.className = 'info-tip';
189
+ container.appendChild(tip);
190
// Fill the shared tooltip with `html`, make it visible, and position it next
// to the pointer using coordinates relative to the container.
const showTip = (html, event) => {
  tip.innerHTML = html;
  tip.style.opacity = '1';

  const bounds = container.getBoundingClientRect();
  const pointerX = event.clientX - bounds.left;
  const pointerY = event.clientY - bounds.top;
  const tipWidth = tip.offsetWidth;

  // Default to the right of the pointer. When the tooltip would overflow the
  // container's right edge, flip it to the left, never past the left edge.
  let left = pointerX + 14;
  if (pointerX + tipWidth + 16 > bounds.width) {
    left = Math.max(0, pointerX - tipWidth - 12);
  }
  tip.style.transform = `translate(${left}px, ${pointerY + 14}px)`;
};
199
// Make the tooltip transparent and park it far outside the container.
const hideTip = () => {
  const { style } = tip;
  style.opacity = '0';
  style.transform = 'translate(-9999px,-9999px)';
};
200
+
201
// Pills: one clickable toggle per model inside a labelled control group.
const controls = document.createElement('div');
controls.className = 'controls';
const grp = document.createElement('div');
grp.className = 'control-group';
const lab = document.createElement('div');
lab.className = 'label';
lab.textContent = 'Models (click to toggle)';
grp.appendChild(lab);

// Stable categorical color per model, matching the prefix-collapse chart.
// Use the shared palette when the page provides one. If it is missing or
// comes back empty, fall back to the built-in colors; an empty palette would
// make the modulo lookup below yield `undefined` for every model.
const fallbackPalette = ['#3fb950', '#f85149', '#58a6ff', '#f0883e', '#bc8cff', '#f7c843'];
const sharedPalette = window.ColorPalettes
  ? window.ColorPalettes.getColors('categorical', Math.max(6, models.length))
  : null;
const palette = sharedPalette && sharedPalette.length > 0 ? sharedPalette : fallbackPalette;
const modelColor = Object.fromEntries(models.map((m, i) => [m, palette[i % palette.length]]));

const pillRow = document.createElement('div');
pillRow.className = 'pills';
models.forEach(m => {
  const pill = document.createElement('span');
  pill.className = 'pill' + (state.visible.has(m) ? ' active' : '');
  pill.style.setProperty('--pill-color', modelColor[m]);

  // Build the pill from DOM nodes rather than innerHTML. The model name is
  // CSV text, so characters such as '<' or '&' must be shown literally.
  const dot = document.createElement('span');
  dot.className = 'dot';
  pill.appendChild(dot);
  pill.appendChild(document.createTextNode(m));

  pill.addEventListener('click', () => {
    if (state.visible.has(m)) {
      // Keep at least one model visible so the table never becomes empty.
      if (state.visible.size > 1) state.visible.delete(m);
    } else {
      state.visible.add(m);
    }
    drawTable();
  });
  pillRow.appendChild(pill);
});
grp.appendChild(pillRow);
controls.appendChild(grp);
container.appendChild(controls);
236
+
237
+ const tableWrap = document.createElement('div');
238
+ tableWrap.className = 'table-scroll';
239
+ container.appendChild(tableWrap);
240
+
241
// Rebuild the comparison table for the currently visible models and sync the
// pill highlight state with it.
//
// Layout: one header row (metric + one column per visible model), then the
// metrics in CSV order, with a group-header row whenever the group changes.
// Each value cell shows the formatted number over a background bar. The best
// cell per row is marked according to the metric's direction ('higher' or
// 'lower'); 'neutral' metrics get neither bar nor highlight.
const drawTable = () => {
  // The metric name is interpolated into tooltip markup below, so escape the
  // characters that are significant in HTML.
  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHTML = (s) => String(s).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);

  // Refresh pills
  Array.from(pillRow.children).forEach((pill, i) => {
    pill.classList.toggle('active', state.visible.has(models[i]));
  });

  const visibleModels = models.filter(m => state.visible.has(m));
  const table = document.createElement('table');
  const thead = document.createElement('thead');
  const trh = document.createElement('tr');
  trh.appendChild(Object.assign(document.createElement('th'), { textContent: 'Metric' }));
  visibleModels.forEach(m => {
    trh.appendChild(Object.assign(document.createElement('th'), { textContent: m }));
  });
  thead.appendChild(trh);
  table.appendChild(thead);

  const tbody = document.createElement('tbody');
  let currentGroup = null;
  metrics.forEach(metric => {
    const meta = metricMeta[metric];
    if (meta.group !== currentGroup) {
      // Start of a new metric group: emit a full-width header row.
      const groupTr = document.createElement('tr');
      groupTr.className = 'group-header';
      const td = document.createElement('td');
      td.colSpan = visibleModels.length + 1;
      td.textContent = meta.group;
      groupTr.appendChild(td);
      tbody.appendChild(groupTr);
      currentGroup = meta.group;
    }

    const tr = document.createElement('tr');
    const labelTd = document.createElement('td');
    const labelSpan = document.createElement('span');
    labelSpan.className = 'metric-label';
    labelSpan.textContent = metric;
    if (meta.description) {
      // The metric name is plain text (it is set via textContent above), so
      // escape it before embedding it in markup.
      // NOTE(review): the description is inserted as-is because it may carry
      // intentional markup; confirm against the CSV generator and escape it
      // too if it is meant to be plain text.
      const descHTML = `<strong>${escapeHTML(metric)}</strong><br/><span style="color:var(--muted-color)">${meta.description}</span>`;
      labelSpan.addEventListener('mouseenter', (e) => showTip(descHTML, e));
      labelSpan.addEventListener('mousemove', (e) => showTip(descHTML, e));
      labelSpan.addEventListener('mouseleave', hideTip);
    }
    labelTd.appendChild(labelSpan);
    tr.appendChild(labelTd);

    const vals = visibleModels.map(m => grouped[metric][m]);
    // Rank only finite values. One missing or unparseable cell would
    // otherwise turn Math.max/Math.min into NaN and wipe the highlight and
    // bars for the whole row.
    const finiteVals = vals.filter(Number.isFinite);
    const maxVal = finiteVals.length > 0 ? Math.max(...finiteVals) : NaN;
    const minVal = finiteVals.length > 0 ? Math.min(...finiteVals) : NaN;
    // Best cell index depends on whether higher or lower is better.
    // Neutral metrics get no highlight, and neither does a row with fewer
    // than two comparable values.
    let bestIdx = -1;
    if (finiteVals.length > 1) {
      if (meta.direction === 'higher') bestIdx = vals.indexOf(maxVal);
      else if (meta.direction === 'lower') bestIdx = vals.indexOf(minVal);
    }

    visibleModels.forEach((m, i) => {
      const td = document.createElement('td');
      td.className = 'value' + (i === bestIdx ? ' is-max' : '');
      const hasValue = Number.isFinite(vals[i]);
      if (hasValue && meta.direction !== 'neutral') {
        const bar = document.createElement('span');
        bar.className = 'bar';
        // For "lower-is-better" metrics, invert so the smallest value gets the longest bar.
        let norm;
        if (meta.direction === 'higher') {
          norm = maxVal > 0 ? vals[i] / maxVal : 0;
        } else {
          // direction === 'lower'; rescale so min -> 1, max -> small floor
          const range = maxVal - minVal;
          norm = range > 0 ? 1 - (vals[i] - minVal) / range * 0.85 : 1;
        }
        // Clamp so negative values cannot produce an invalid negative width.
        norm = Math.min(1, Math.max(0, norm));
        bar.style.width = `${(norm * 100).toFixed(1)}%`;
        td.appendChild(bar);
      }
      const v = document.createElement('span');
      v.className = 'v';
      // Show an em dash for a missing value instead of "NaN".
      v.textContent = hasValue ? fmtValue(vals[i], meta.format) : '\u2014';
      td.appendChild(v);
      tr.appendChild(td);
    });

    tbody.appendChild(tr);
  });
  table.appendChild(tbody);

  tableWrap.innerHTML = '';
  tableWrap.appendChild(table);
};
328
+
329
+ drawTable();
330
+ };
331
+
332
+ fetchCSV().then(text => render(parseCSV(text))).catch(err => {
333
+ const pre = document.createElement('pre');
334
+ pre.style.color = 'red';
335
+ pre.textContent = `Error loading audit data: ${err.message}`;
336
+ container.appendChild(pre);
337
+ });
338
+ };
339
+
340
+ if (document.readyState === 'loading') {
341
+ document.addEventListener('DOMContentLoaded', bootstrap, { once: true });
342
+ } else {
343
+ bootstrap();
344
+ }
345
+ })();
346
+ </script>