Update 14_d5_arch_config.py
Browse files- 14_d5_arch_config.py +1034 -0
14_d5_arch_config.py
CHANGED
|
@@ -0,0 +1,1034 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ablation_configs.py
|
| 3 |
+
====================
|
| 4 |
+
The ablation matrix for the three-band SVAE validation sweep.
|
| 5 |
+
|
| 6 |
+
Each config is a dict of overrides on the baseline PatchSVAE_F trainer.
|
| 7 |
+
The trainer expects:
|
| 8 |
+
- band: 'LOW' | 'MID' | 'HIGH' (selects the base architecture)
|
| 9 |
+
- variant: unique identifier for this variant within the group
|
| 10 |
+
- seed: random seed
|
| 11 |
+
- phase: 1 (1000-batch triage) | 2 (30-epoch full)
|
| 12 |
+
- overrides: dict of RunConfig field overrides
|
| 13 |
+
|
| 14 |
+
Three band representatives (kept constant across every test):
|
| 15 |
+
LOW:  S=64, V=64, D=16, h=64, d=1, patch=16, 184K params, CV target ~0.21
MID:  S=64, V=64, D=8,  h=64, d=1, patch=16, 183K params, CV target ~0.39
HIGH: S=64, V=32, D=4,  h=64, d=1, patch=4,   41K params, CV target ~1.10
|
| 18 |
+
|
| 19 |
+
Phase 1 early-stop:
|
| 20 |
+
- LOW/MID bands: train to batch 1000, record CV_ema, classify band
|
| 21 |
+
- HIGH band: train to batch 100, record CV_ema, classify band
|
| 22 |
+
|
| 23 |
+
Phase 2 full run:
|
| 24 |
+
- Group E (soft-hand): 1 epoch, 3 seeds per variant
- Group H (SVD necessity): 1 epoch, 3/2/1 seeds staged per variant
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from typing import Dict, List, Any
|
| 29 |
+
from dataclasses import dataclass, field, asdict
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ----------------------------------------------------------------------------
|
| 33 |
+
# Band representatives β the three anchor configs
|
| 34 |
+
# ----------------------------------------------------------------------------
|
| 35 |
+
|
| 36 |
+
BAND_REPS = {
|
| 37 |
+
'LOW': {
|
| 38 |
+
'img_size': 64,
|
| 39 |
+
'V': 64,
|
| 40 |
+
'D': 16,
|
| 41 |
+
'hidden': 64,
|
| 42 |
+
'depth': 1,
|
| 43 |
+
'patch_size': 16,
|
| 44 |
+
'n_cross': 1,
|
| 45 |
+
'expected_cv': 0.21,
|
| 46 |
+
'expected_params': 184_000,
|
| 47 |
+
},
|
| 48 |
+
'MID': {
|
| 49 |
+
'img_size': 64,
|
| 50 |
+
'V': 64,
|
| 51 |
+
'D': 8,
|
| 52 |
+
'hidden': 64,
|
| 53 |
+
'depth': 1,
|
| 54 |
+
'patch_size': 16,
|
| 55 |
+
'n_cross': 1,
|
| 56 |
+
'expected_cv': 0.39,
|
| 57 |
+
'expected_params': 183_000,
|
| 58 |
+
},
|
| 59 |
+
'HIGH': {
|
| 60 |
+
'img_size': 64,
|
| 61 |
+
'V': 32,
|
| 62 |
+
'D': 4,
|
| 63 |
+
'hidden': 64,
|
| 64 |
+
'depth': 1,
|
| 65 |
+
'patch_size': 4,
|
| 66 |
+
'n_cross': 1,
|
| 67 |
+
'expected_cv': 1.10,
|
| 68 |
+
'expected_params': 41_000,
|
| 69 |
+
},
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def band_classifier(cv_ema: float) -> str:
|
| 74 |
+
"""Classify a final CV-EMA value into a band."""
|
| 75 |
+
if cv_ema < 0.30:
|
| 76 |
+
return 'LOW'
|
| 77 |
+
elif cv_ema < 0.55:
|
| 78 |
+
return 'MID'
|
| 79 |
+
elif cv_ema > 0.80:
|
| 80 |
+
return 'HIGH'
|
| 81 |
+
return 'UNCLASSIFIED'
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def phase1_batch_limit(band: str) -> int:
|
| 85 |
+
"""How many batches to train before stopping for Phase 1 band classification."""
|
| 86 |
+
if band == 'HIGH':
|
| 87 |
+
return 100
|
| 88 |
+
return 1000
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def phase2_batch_limit(config: Dict[str, Any]) -> int:
|
| 92 |
+
"""How many batches per epoch for Phase 2.
|
| 93 |
+
|
| 94 |
+
Per-config override: if the config specifies 'batch_limit', use it.
|
| 95 |
+
This allows the floor sweep (P group) to cap at a few dozen batches
|
| 96 |
+
without changing defaults for existing phase-2 configs.
|
| 97 |
+
|
| 98 |
+
Default behavior (unchanged):
|
| 99 |
+
- Adam at batch_size=256: 1_000_000 / 256 β 3900 batches
|
| 100 |
+
- LBFGS at batch_size=32: normally 31250 batches, but LBFGS
|
| 101 |
+
does 20 inner iterations per outer step so ~40k gradient steps
|
| 102 |
+
per batch β we cap at 2000 outer batches = ~40k gradient steps
|
| 103 |
+
which is plenty for within-attractor convergence
|
| 104 |
+
|
| 105 |
+
The batch_size is read from the config (Phase 2 configs include
|
| 106 |
+
an explicit batch_size field).
|
| 107 |
+
"""
|
| 108 |
+
# Per-config explicit batch_limit takes precedence
|
| 109 |
+
if 'batch_limit' in config:
|
| 110 |
+
return config['batch_limit']
|
| 111 |
+
|
| 112 |
+
overrides = config.get('overrides', {})
|
| 113 |
+
if overrides.get('optimizer') == 'lbfgs':
|
| 114 |
+
return 2000 # cap for LBFGS wallclock
|
| 115 |
+
|
| 116 |
+
batch_size = config.get('batch_size', 256)
|
| 117 |
+
return 1_000_000 // batch_size
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# ----------------------------------------------------------------------------
|
| 121 |
+
# Ablation group definitions
|
| 122 |
+
# ----------------------------------------------------------------------------
|
| 123 |
+
|
| 124 |
+
def group_A_seed_replication() -> List[Dict[str, Any]]:
|
| 125 |
+
"""Reproducibility: 5 seeds Γ 3 bands = 15 runs.
|
| 126 |
+
|
| 127 |
+
Tests whether each band reproducibly appears across random inits.
|
| 128 |
+
Acceptance: >=4/5 seeds per band within +/-0.02 of expected CV.
|
| 129 |
+
"""
|
| 130 |
+
configs = []
|
| 131 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 132 |
+
for seed in range(5):
|
| 133 |
+
configs.append({
|
| 134 |
+
'group': 'A',
|
| 135 |
+
'variant': 'baseline',
|
| 136 |
+
'band': band,
|
| 137 |
+
'seed': seed,
|
| 138 |
+
'phase': 1,
|
| 139 |
+
'overrides': {}, # no overrides, just seed variation
|
| 140 |
+
'description': f'A-{band}-baseline-s{seed}',
|
| 141 |
+
})
|
| 142 |
+
return configs
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def group_B_dataset_composition() -> List[Dict[str, Any]]:
|
| 146 |
+
"""Noise-type dependence: 6 variants Γ 3 bands = 18 runs.
|
| 147 |
+
|
| 148 |
+
Tests whether band structure is architecture-driven or data-driven.
|
| 149 |
+
"""
|
| 150 |
+
variants = {
|
| 151 |
+
'B1_all16': list(range(16)),
|
| 152 |
+
'B2_gaussian_only': [0],
|
| 153 |
+
'B3_structured': [3, 4, 5, 11, 13], # block, gradient, checker, mixed, structural
|
| 154 |
+
'B4_heavy_tailed': [6, 7, 10], # cauchy, laplace, exponential (check indices)
|
| 155 |
+
'B5_first_half': list(range(8)),
|
| 156 |
+
'B6_even_indices': [0, 2, 4, 6, 8, 10, 12, 14],
|
| 157 |
+
}
|
| 158 |
+
configs = []
|
| 159 |
+
for variant_name, types in variants.items():
|
| 160 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 161 |
+
configs.append({
|
| 162 |
+
'group': 'B',
|
| 163 |
+
'variant': variant_name,
|
| 164 |
+
'band': band,
|
| 165 |
+
'seed': 0,
|
| 166 |
+
'phase': 1,
|
| 167 |
+
'overrides': {'noise_types': types},
|
| 168 |
+
'description': f'B-{band}-{variant_name}',
|
| 169 |
+
})
|
| 170 |
+
return configs
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def group_C_optimizer() -> List[Dict[str, Any]]:
|
| 174 |
+
"""Optimizer dependence: 4 variants Γ 3 bands = 12 runs.
|
| 175 |
+
|
| 176 |
+
Tests whether attractor is Adam-specific.
|
| 177 |
+
|
| 178 |
+
NOTE: LBFGS was originally included as C5 but removed 2026-04-20
|
| 179 |
+
after empirical evidence that it is incompatible with the sphere-
|
| 180 |
+
normed architecture as currently constructed. LBFGS's flat-space
|
| 181 |
+
strong Wolfe line search drives parameters away from the sphere
|
| 182 |
+
manifold during line search, producing ill-conditioned SVD inputs.
|
| 183 |
+
Symptoms observed: D=16 crashed in torch.linalg.eigh with "failed
|
| 184 |
+
to converge β ill-conditioned or too many repeated eigenvalues";
|
| 185 |
+
D=8 and D=4 completed but produced NaN MSE (CV measurements at
|
| 186 |
+
intermediate batches were valid β 0.3373 MID, 0.9435 HIGH β but
|
| 187 |
+
final test MSE was NaN, indicating parameters went non-finite
|
| 188 |
+
during training).
|
| 189 |
+
|
| 190 |
+
This is NOT a finding about LBFGS as an optimizer β it's a finding
|
| 191 |
+
about the LBFGS-sphere_norm interaction. Proper test requires
|
| 192 |
+
Riemannian LBFGS with constraint-aware line search. See scratchpad
|
| 193 |
+
entry 000080 for the dedicated LBFGS engineering pass TODO.
|
| 194 |
+
"""
|
| 195 |
+
variants = [
|
| 196 |
+
('C1_adam', {'optimizer': 'adam', 'lr': 1e-4, 'weight_decay': 0.0}),
|
| 197 |
+
('C2_sgd', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.0}),
|
| 198 |
+
('C3_sgd_momentum', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.9}),
|
| 199 |
+
('C4_adamw', {'optimizer': 'adamw', 'lr': 1e-4, 'weight_decay': 0.01}),
|
| 200 |
+
]
|
| 201 |
+
configs = []
|
| 202 |
+
for variant_name, overrides in variants:
|
| 203 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 204 |
+
configs.append({
|
| 205 |
+
'group': 'C',
|
| 206 |
+
'variant': variant_name,
|
| 207 |
+
'band': band,
|
| 208 |
+
'seed': 0,
|
| 209 |
+
'phase': 1,
|
| 210 |
+
'overrides': overrides,
|
| 211 |
+
'description': f'C-{band}-{variant_name}',
|
| 212 |
+
})
|
| 213 |
+
return configs
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def group_D_schedule() -> List[Dict[str, Any]]:
|
| 217 |
+
"""LR schedule: 5 variants Γ 3 bands = 15 runs."""
|
| 218 |
+
variants = [
|
| 219 |
+
('D1_cosine', {'scheduler': 'cosine'}),
|
| 220 |
+
('D2_constant', {'scheduler': 'constant'}),
|
| 221 |
+
('D3_linear_decay', {'scheduler': 'linear'}),
|
| 222 |
+
('D4_warm_restart', {'scheduler': 'cosine_warm_restarts', 'T_0': 1000}),
|
| 223 |
+
('D5_one_cycle', {'scheduler': 'one_cycle'}),
|
| 224 |
+
]
|
| 225 |
+
configs = []
|
| 226 |
+
for variant_name, overrides in variants:
|
| 227 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 228 |
+
configs.append({
|
| 229 |
+
'group': 'D',
|
| 230 |
+
'variant': variant_name,
|
| 231 |
+
'band': band,
|
| 232 |
+
'seed': 0,
|
| 233 |
+
'phase': 1,
|
| 234 |
+
'overrides': overrides,
|
| 235 |
+
'description': f'D-{band}-{variant_name}',
|
| 236 |
+
})
|
| 237 |
+
return configs
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def group_E_soft_hand() -> List[Dict[str, Any]]:
|
| 241 |
+
"""Soft-hand guidance β PHASE 2 (1 epoch, ~3900 batches at batch_size=256).
|
| 242 |
+
|
| 243 |
+
Phase 1 E_preview already showed all four variants reach the same band
|
| 244 |
+
at 1000 batches (all within 0.0014 CV). The Phase 2 question is NO
|
| 245 |
+
LONGER "does the attractor survive" β that's settled β but rather:
|
| 246 |
+
"what's the within-attractor reconstruction MSE under each soft-hand
|
| 247 |
+
regime over a full epoch?"
|
| 248 |
+
|
| 249 |
+
Primary comparison: E1 (full soft-hand) vs E2 (pure MSE). If MSE
|
| 250 |
+
differs meaningfully, soft-hand is trading reconstruction quality
|
| 251 |
+
for geometric coherence at an epoch-scale budget.
|
| 252 |
+
|
| 253 |
+
4 variants Γ 3 bands Γ 3 seeds = 36 runs.
|
| 254 |
+
"""
|
| 255 |
+
variants = [
|
| 256 |
+
('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
|
| 257 |
+
('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
|
| 258 |
+
('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
|
| 259 |
+
('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
|
| 260 |
+
]
|
| 261 |
+
configs = []
|
| 262 |
+
for variant_name, overrides in variants:
|
| 263 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 264 |
+
for seed in range(3):
|
| 265 |
+
configs.append({
|
| 266 |
+
'group': 'E',
|
| 267 |
+
'variant': variant_name,
|
| 268 |
+
'band': band,
|
| 269 |
+
'seed': seed,
|
| 270 |
+
'phase': 2,
|
| 271 |
+
'num_epochs': 1,
|
| 272 |
+
'batch_size': 256,
|
| 273 |
+
'overrides': overrides,
|
| 274 |
+
'description': f'E-{band}-{variant_name}-s{seed}',
|
| 275 |
+
})
|
| 276 |
+
return configs
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def group_E_subset_phase1() -> List[Dict[str, Any]]:
|
| 280 |
+
"""E subset for Phase 1 preview β 1 seed per variant, 1000 batches.
|
| 281 |
+
|
| 282 |
+
Quick read on whether E2 even approaches the attractor before
|
| 283 |
+
committing to full Phase 2 Group E. 4 variants Γ 3 bands = 12 runs.
|
| 284 |
+
"""
|
| 285 |
+
variants = [
|
| 286 |
+
('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
|
| 287 |
+
('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
|
| 288 |
+
('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
|
| 289 |
+
('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
|
| 290 |
+
]
|
| 291 |
+
configs = []
|
| 292 |
+
for variant_name, overrides in variants:
|
| 293 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 294 |
+
configs.append({
|
| 295 |
+
'group': 'E_preview',
|
| 296 |
+
'variant': variant_name,
|
| 297 |
+
'band': band,
|
| 298 |
+
'seed': 0,
|
| 299 |
+
'phase': 1,
|
| 300 |
+
'overrides': overrides,
|
| 301 |
+
'description': f'Eprev-{band}-{variant_name}',
|
| 302 |
+
})
|
| 303 |
+
return configs
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def group_F_activation() -> List[Dict[str, Any]]:
|
| 307 |
+
"""Activation function: 5 variants Γ 3 bands = 15 runs."""
|
| 308 |
+
variants = [
|
| 309 |
+
('F1_gelu', {'activation': 'gelu'}),
|
| 310 |
+
('F2_relu', {'activation': 'relu'}),
|
| 311 |
+
('F3_silu', {'activation': 'silu'}),
|
| 312 |
+
('F4_tanh', {'activation': 'tanh'}),
|
| 313 |
+
('F5_identity', {'activation': 'identity'}),
|
| 314 |
+
]
|
| 315 |
+
configs = []
|
| 316 |
+
for variant_name, overrides in variants:
|
| 317 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 318 |
+
configs.append({
|
| 319 |
+
'group': 'F',
|
| 320 |
+
'variant': variant_name,
|
| 321 |
+
'band': band,
|
| 322 |
+
'seed': 0,
|
| 323 |
+
'phase': 1,
|
| 324 |
+
'overrides': overrides,
|
| 325 |
+
'description': f'F-{band}-{variant_name}',
|
| 326 |
+
})
|
| 327 |
+
return configs
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def group_G_sphere_norm() -> List[Dict[str, Any]]:
|
| 331 |
+
"""Sphere-norm ablation: 4 variants Γ 3 bands = 12 runs.
|
| 332 |
+
|
| 333 |
+
Expected per framework: G2 (no sphere-norm) reproduces charge-
|
| 334 |
+
discharge catastrophe. G3/G4 may or may not preserve the band.
|
| 335 |
+
"""
|
| 336 |
+
variants = [
|
| 337 |
+
('G1_sphere_norm', {'row_norm': 'sphere'}), # baseline, F.normalize(dim=-1)
|
| 338 |
+
('G2_no_norm', {'row_norm': 'none'}), # raw M to SVD
|
| 339 |
+
('G3_layer_norm', {'row_norm': 'layer_norm'}),
|
| 340 |
+
('G4_scale_only', {'row_norm': 'scale_only'}),
|
| 341 |
+
]
|
| 342 |
+
configs = []
|
| 343 |
+
for variant_name, overrides in variants:
|
| 344 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 345 |
+
configs.append({
|
| 346 |
+
'group': 'G',
|
| 347 |
+
'variant': variant_name,
|
| 348 |
+
'band': band,
|
| 349 |
+
'seed': 0,
|
| 350 |
+
'phase': 1,
|
| 351 |
+
'overrides': overrides,
|
| 352 |
+
'description': f'G-{band}-{variant_name}',
|
| 353 |
+
})
|
| 354 |
+
return configs
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def group_H_svd_necessity() -> List[Dict[str, Any]]:
|
| 358 |
+
"""SVD necessity β PHASE 2 (1 epoch, ~3900 batches at batch_size=256).
|
| 359 |
+
|
| 360 |
+
Tests whether learned linear readout can match SVD, and whether
|
| 361 |
+
fp64 SVD precision and per-batch SVD are load-bearing.
|
| 362 |
+
|
| 363 |
+
Staged seed counts based on the question each variant answers:
|
| 364 |
+
- H1/H2/H3 (3 seeds): core SVD-vs-linear comparison, needs variance
|
| 365 |
+
- H4/H5 (2 seeds): precision/batching questions, binary yes/no
|
| 366 |
+
- H6 (1 seed): expected-failure confirmation
|
| 367 |
+
|
| 368 |
+
Total: 3Γ3 + 3Γ3 + 3Γ3 + 3Γ2 + 3Γ2 + 3Γ1 = 42 runs
|
| 369 |
+
"""
|
| 370 |
+
variants_full = [ # 3 seeds
|
| 371 |
+
('H1_svd_fp64', {'svd': 'fp64'}),
|
| 372 |
+
('H2_linear_matched', {'svd': 'none', 'linear_readout': True, 'match_params': True}),
|
| 373 |
+
('H3_linear_unmatched', {'svd': 'none', 'linear_readout': True, 'match_params': False}),
|
| 374 |
+
]
|
| 375 |
+
variants_probe = [ # 2 seeds
|
| 376 |
+
('H4_svd_fp32', {'svd': 'fp32'}),
|
| 377 |
+
('H5_batch_shared_svd', {'svd': 'batch_shared'}),
|
| 378 |
+
]
|
| 379 |
+
variants_confirm = [ # 1 seed, expected failure
|
| 380 |
+
('H6_no_svd_direct', {'svd': 'none', 'linear_readout': False}),
|
| 381 |
+
]
|
| 382 |
+
configs = []
|
| 383 |
+
for variants, n_seeds in [(variants_full, 3), (variants_probe, 2), (variants_confirm, 1)]:
|
| 384 |
+
for variant_name, overrides in variants:
|
| 385 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 386 |
+
for seed in range(n_seeds):
|
| 387 |
+
configs.append({
|
| 388 |
+
'group': 'H',
|
| 389 |
+
'variant': variant_name,
|
| 390 |
+
'band': band,
|
| 391 |
+
'seed': seed,
|
| 392 |
+
'phase': 2,
|
| 393 |
+
'num_epochs': 1,
|
| 394 |
+
'batch_size': 256,
|
| 395 |
+
'overrides': overrides,
|
| 396 |
+
'description': f'H-{band}-{variant_name}-s{seed}',
|
| 397 |
+
})
|
| 398 |
+
return configs
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def group_L2_lbfgs() -> List[Dict[str, Any]]:
|
| 402 |
+
"""LBFGS characterization β PHASE 2 (1 epoch, ~3900 batches at batch_size=256).
|
| 403 |
+
|
| 404 |
+
Front-loads LBFGS investigation after Phil's isolated test at 100
|
| 405 |
+
batches showed LBFGS + pure MSE + no soft-hand reaches the HIGH
|
| 406 |
+
attractor (CV 0.869) with better within-attractor reconstruction MSE
|
| 407 |
+
(0.0644) than Adam + soft-hand achieves at 30 epochs (0.072).
|
| 408 |
+
|
| 409 |
+
Phase 2 L2 tests whether this gap holds at epoch scale and whether
|
| 410 |
+
MID band shows a similar effect.
|
| 411 |
+
|
| 412 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 413 |
+
STIPEND: LOW band (D=16) OMITTED pending LBFGS engineering pass.
|
| 414 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 415 |
+
Isolated test in Phase 1 session confirmed LBFGS + sphere_norm +
|
| 416 |
+
D=16 crashes torch.linalg.eigh (error code 15, ill-conditioned
|
| 417 |
+
Gram matrix). PyTorch LBFGS's flat-space strong Wolfe line search
|
| 418 |
+
drives parameters off the sphere manifold, producing degenerate
|
| 419 |
+
SVD inputs. Fix requires Riemannian (constraint-aware) line
|
| 420 |
+
search β see scratchpad entry 000080 for the engineering pass
|
| 421 |
+
TODO. L2-LOW will be runnable once RLBFGS integration lands.
|
| 422 |
+
|
| 423 |
+
Current scope: MID + HIGH only, pure MSE + no soft-hand
|
| 424 |
+
(matching the Phil isolated test configuration that produced
|
| 425 |
+
the 0.869/0.0644 data point).
|
| 426 |
+
|
| 427 |
+
2 bands Γ 3 seeds = 6 runs.
|
| 428 |
+
"""
|
| 429 |
+
variants = [
|
| 430 |
+
('L2_lbfgs_pure_mse', {
|
| 431 |
+
'optimizer': 'lbfgs',
|
| 432 |
+
'lr': 1.0,
|
| 433 |
+
'batch_size': 32, # LBFGS small-batch required for closure stability
|
| 434 |
+
'soft_hand': False, # no soft-hand (corrupted Hessian approximation)
|
| 435 |
+
'boost': 0.0,
|
| 436 |
+
'cv_penalty': 0.0,
|
| 437 |
+
}),
|
| 438 |
+
]
|
| 439 |
+
configs = []
|
| 440 |
+
for variant_name, overrides in variants:
|
| 441 |
+
for band in ['MID', 'HIGH']: # LOW stipended β see docstring
|
| 442 |
+
for seed in range(3):
|
| 443 |
+
configs.append({
|
| 444 |
+
'group': 'L2',
|
| 445 |
+
'variant': variant_name,
|
| 446 |
+
'band': band,
|
| 447 |
+
'seed': seed,
|
| 448 |
+
'phase': 2,
|
| 449 |
+
'num_epochs': 1,
|
| 450 |
+
'batch_size': 32, # overrides default (LBFGS needs small batch)
|
| 451 |
+
'overrides': overrides,
|
| 452 |
+
'description': f'L2-{band}-{variant_name}-s{seed}',
|
| 453 |
+
})
|
| 454 |
+
return configs
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def group_I_cross_attention() -> List[Dict[str, Any]]:
|
| 458 |
+
"""Cross-attention necessity: 4 variants Γ 3 bands = 12 runs."""
|
| 459 |
+
variants = [
|
| 460 |
+
('I1_1layer', {'n_cross': 1, 'max_alpha': 0.2}),
|
| 461 |
+
('I2_0layers', {'n_cross': 0}),
|
| 462 |
+
('I3_2layers', {'n_cross': 2, 'max_alpha': 0.2}),
|
| 463 |
+
('I4_unbounded_alpha', {'n_cross': 1, 'max_alpha': 1.0}),
|
| 464 |
+
]
|
| 465 |
+
configs = []
|
| 466 |
+
for variant_name, overrides in variants:
|
| 467 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 468 |
+
configs.append({
|
| 469 |
+
'group': 'I',
|
| 470 |
+
'variant': variant_name,
|
| 471 |
+
'band': band,
|
| 472 |
+
'seed': 0,
|
| 473 |
+
'phase': 1,
|
| 474 |
+
'overrides': overrides,
|
| 475 |
+
'description': f'I-{band}-{variant_name}',
|
| 476 |
+
})
|
| 477 |
+
return configs
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def group_J_capacity_within_LOW() -> List[Dict[str, Any]]:
|
| 481 |
+
"""Minimum on-attractor parameter count β LOW band only, 5 variants."""
|
| 482 |
+
variants = [
|
| 483 |
+
('J1_V64_h64', {'V': 64, 'hidden': 64}), # baseline, 184K
|
| 484 |
+
('J2_V32_h32', {'V': 32, 'hidden': 32}), # ~50K
|
| 485 |
+
('J3_V16_h32', {'V': 16, 'hidden': 32}), # ~30K
|
| 486 |
+
('J4_V64_h32', {'V': 64, 'hidden': 32}), # ~100K
|
| 487 |
+
('J5_V128_h128', {'V': 128, 'hidden': 128}), # ~528K
|
| 488 |
+
]
|
| 489 |
+
configs = []
|
| 490 |
+
for variant_name, overrides in variants:
|
| 491 |
+
configs.append({
|
| 492 |
+
'group': 'J',
|
| 493 |
+
'variant': variant_name,
|
| 494 |
+
'band': 'LOW',
|
| 495 |
+
'seed': 0,
|
| 496 |
+
'phase': 1,
|
| 497 |
+
'overrides': overrides,
|
| 498 |
+
'description': f'J-LOW-{variant_name}',
|
| 499 |
+
})
|
| 500 |
+
return configs
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
def group_K_batch_size() -> List[Dict[str, Any]]:
|
| 504 |
+
"""Batch size sensitivity: 4 variants Γ 3 bands = 12 runs."""
|
| 505 |
+
variants = [
|
| 506 |
+
('K1_bs128', {'batch_size': 128}),
|
| 507 |
+
('K2_bs32', {'batch_size': 32}),
|
| 508 |
+
('K3_bs512', {'batch_size': 512}),
|
| 509 |
+
('K4_bs1024', {'batch_size': 1024}),
|
| 510 |
+
]
|
| 511 |
+
configs = []
|
| 512 |
+
for variant_name, overrides in variants:
|
| 513 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 514 |
+
configs.append({
|
| 515 |
+
'group': 'K',
|
| 516 |
+
'variant': variant_name,
|
| 517 |
+
'band': band,
|
| 518 |
+
'seed': 0,
|
| 519 |
+
'phase': 1,
|
| 520 |
+
'overrides': overrides,
|
| 521 |
+
'description': f'K-{band}-{variant_name}',
|
| 522 |
+
})
|
| 523 |
+
return configs
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def group_L_initialization() -> List[Dict[str, Any]]:
|
| 527 |
+
"""Init: 4 variants Γ 3 bands = 12 runs."""
|
| 528 |
+
variants = [
|
| 529 |
+
('L1_orthogonal', {'init': 'orthogonal'}),
|
| 530 |
+
('L2_kaiming', {'init': 'kaiming_normal'}),
|
| 531 |
+
('L3_xavier', {'init': 'xavier_uniform'}),
|
| 532 |
+
('L4_normal_small', {'init': 'normal_0_02'}),
|
| 533 |
+
]
|
| 534 |
+
configs = []
|
| 535 |
+
for variant_name, overrides in variants:
|
| 536 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 537 |
+
configs.append({
|
| 538 |
+
'group': 'L',
|
| 539 |
+
'variant': variant_name,
|
| 540 |
+
'band': band,
|
| 541 |
+
'seed': 0,
|
| 542 |
+
'phase': 1,
|
| 543 |
+
'overrides': overrides,
|
| 544 |
+
'description': f'L-{band}-{variant_name}',
|
| 545 |
+
})
|
| 546 |
+
return configs
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
def group_M_brute_force_sgd() -> List[Dict[str, Any]]:
|
| 550 |
+
"""Brute-force SGD stress: 3 variants Γ 3 bands = 9 runs."""
|
| 551 |
+
variants = [
|
| 552 |
+
('M1_sgd_aggressive', {'optimizer': 'sgd', 'lr': 1e-1, 'momentum': 0.0, 'warmup': 0}),
|
| 553 |
+
('M2_sgd_huge_lr', {'optimizer': 'sgd', 'lr': 1.0, 'momentum': 0.0, 'grad_clip': 1.0}),
|
| 554 |
+
('M3_sgd_high_momentum',{'optimizer': 'sgd', 'lr': 3e-3, 'momentum': 0.99}),
|
| 555 |
+
]
|
| 556 |
+
configs = []
|
| 557 |
+
for variant_name, overrides in variants:
|
| 558 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 559 |
+
configs.append({
|
| 560 |
+
'group': 'M',
|
| 561 |
+
'variant': variant_name,
|
| 562 |
+
'band': band,
|
| 563 |
+
'seed': 0,
|
| 564 |
+
'phase': 1,
|
| 565 |
+
'overrides': overrides,
|
| 566 |
+
'description': f'M-{band}-{variant_name}',
|
| 567 |
+
})
|
| 568 |
+
return configs
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def group_N_uniformity_diagnostic() -> List[Dict[str, Any]]:
|
| 572 |
+
"""Attractor uniformity diagnostic β NOT a standalone group.
|
| 573 |
+
|
| 574 |
+
Instead, ADDED TO EVERY other variant's post-training analysis:
|
| 575 |
+
1. Extract final sphere-normed rows
|
| 576 |
+
2. Compute pentachoron CV at n_samples=2000
|
| 577 |
+
3. Compare to uniform-sphere prediction for that D
|
| 578 |
+
4. Record observed_CV, uniform_CV, deviation in final_report.json
|
| 579 |
+
|
| 580 |
+
This function returns 0 standalone configs β Group N is a flag
|
| 581 |
+
that every other group's runs should include the diagnostic.
|
| 582 |
+
"""
|
| 583 |
+
return []
|
| 584 |
+
|
| 585 |
+
|
| 586 |
+
# ----------------------------------------------------------------------------
|
| 587 |
+
# Full matrix assembly
|
| 588 |
+
# ----------------------------------------------------------------------------
|
| 589 |
+
|
| 590 |
+
def get_phase1_configs() -> List[Dict[str, Any]]:
|
| 591 |
+
"""Phase 1 matrix β all band-classification ablations.
|
| 592 |
+
|
| 593 |
+
Recommended run order (most informative first):
|
| 594 |
+
1. Group A (seed replication) β foundational
|
| 595 |
+
2. Group G (sphere-norm) β framework verification
|
| 596 |
+
3. Group E_preview (soft-hand 1000-batch preview)
|
| 597 |
+
4. Group B, C, D, F, I, J, K, L, M β remaining ablations
|
| 598 |
+
"""
|
| 599 |
+
return (
|
| 600 |
+
group_A_seed_replication() # 15 runs
|
| 601 |
+
+ group_G_sphere_norm() # 12 runs
|
| 602 |
+
+ group_E_subset_phase1() # 12 runs
|
| 603 |
+
+ group_B_dataset_composition() # 18 runs
|
| 604 |
+
+ group_C_optimizer() # 15 runs
|
| 605 |
+
+ group_D_schedule() # 15 runs
|
| 606 |
+
+ group_F_activation() # 15 runs
|
| 607 |
+
+ group_I_cross_attention() # 12 runs
|
| 608 |
+
+ group_J_capacity_within_LOW() # 5 runs
|
| 609 |
+
+ group_K_batch_size() # 12 runs
|
| 610 |
+
+ group_L_initialization() # 12 runs
|
| 611 |
+
+ group_M_brute_force_sgd() # 9 runs
|
| 612 |
+
)
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
def group_P_small_battery_floor() -> List[Dict[str, Any]]:
    """Small-battery floor sweep -- Phase 2 variant with a tiny batch budget.

    Grid-sweeps the architecture at the H2_linear_matched baseline to find
    the smallest battery that still reconstructs gaussian within a
    reasonable multiplier of the h2-64 floor AND lands in a valid
    geometric attractor (CV in the MID/HIGH range).

    Grid axes (full product 5 x 5 x 3 x 2 x 2 x 2 = 600 runs):
      hidden:    {4, 8, 16, 32, 64}
      V:         {2, 4, 8, 16, 32}
      D:         {2, 3, 4}
      depth:     {0, 1}
      n_cross:   {0, 1}
      optimizer: {'adam', 'lbfgs'}

    Pins (H2_linear_matched baseline):
      svd='none', linear_readout=True, match_params=True
      band='HIGH' (patch_size=4, img_size=64)
      batch_size=256
      batch_limit=20 (5120 samples seen -- matches the floor-sweep budget)

    NOTE: smooth_mid is NOT varied here -- PatchSVAE_F_Ablation does not
    expose it as a parameter, so every config uses its default
    BoundarySmooth. If smooth_mid variation is needed later, plumb it
    through the model class and add it as a grid axis.

    LIMITATION: cv_of() returns 0 for V<5 (pentachoron volume needs >=5
    points). Configs with V in {2, 4} will report observed_sphere_cv=0,
    cv_ema=0, and predicted_band='LOW'. This is an architectural
    constraint of the geometric validity metric, not a training failure.
    Use test_mse_per_noise[0] and train_loss_trajectory as the primary
    quality metrics for those configs; CV-based analysis applies only to
    V>=8 configs.

    Records via run_ablation_config's full report: CV_ema, cv_last, S0,
    SD, ratio, erank, observed_sphere_cv, band_deviation, predicted_band,
    band_match, params_finite, cv_trajectory, train_loss_trajectory,
    test_mse, test_mse_per_noise, plus per-config wallclock and
    batches_completed.
    """

    def make_config(hidden: int, V: int, D: int, depth: int,
                    n_cross: int, optimizer: str) -> Dict[str, Any]:
        """Assemble one grid-point config at the H2_linear_matched baseline."""
        variant_name = (
            f"P_h{hidden}_V{V}_D{D}_dp{depth}"
            f"_nx{n_cross}_{optimizer}"
        )
        # Per-optimizer LR tuned for the 20-step budget: Adam at the
        # Phase-2 default of 1e-4 barely moves in 20 steps on small
        # models, so use 3e-3. LBFGS's line search handles its own step
        # sizing; 1.0 is the library default for a unit Wolfe step.
        lr = 3e-3 if optimizer == 'adam' else 1.0
        return {
            'group': 'P',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 20,
            'overrides': {
                # H2_linear_matched baseline
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                # Size axes
                'hidden': hidden,
                'V': V,
                'D': D,
                'depth': depth,
                'n_cross': n_cross,
                # Pin n_heads=1: D varies over {2, 3, 4} and the
                # default n_heads=4 would fail for D=2, 3.
                'n_heads': 1,
                # Optimizer + LR tuned for the short budget
                'optimizer': optimizer,
                'lr': lr,
                # Gradient clipping catches LBFGS explosions (both
                # initial-step Wolfe failures on tiny params and
                # mid-training Hessian-approximation corruption on
                # depth=1 + n_cross=1 configs). Standard defensive
                # practice for small-model sweeps; no cost when not
                # triggered.
                'grad_clip': 1.0,
                # Measure CV every 2 batches (50 was too coarse for a
                # 20-batch sweep).
                'cv_measure_every': 2,
                # Pure MSE, no soft-hand (per 000079 -- avoids LBFGS
                # Hessian corruption).
                'soft_hand': False,
                # Training: gaussian only (for floor detection)
                'noise_types': [0],
                # Testing: all 16 noises, 256 each -- kept separate
                # from the training distribution so per-noise
                # generalization is measured.
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'P-HIGH-{variant_name} '
                f'(floor sweep, 20-batch budget)'
            ),
        }

    configs: List[Dict[str, Any]] = []
    for hidden in (4, 8, 16, 32, 64):
        for V in (2, 4, 8, 16, 32):
            for D in (2, 3, 4):
                for depth in (0, 1):
                    for n_cross in (0, 1):
                        for optimizer in ('adam', 'lbfgs'):
                            configs.append(
                                make_config(hidden, V, D, depth,
                                            n_cross, optimizer)
                            )
    return configs
def group_implicit_solver_A_d5_spherical() -> List[Dict[str, Any]]:
    """Implicit-solver A-set: D=5 spherical reference batteries.

    Three configs test the projective-axis hypothesis at D=5:
      A3a: V=16, D=5 -- minimal V, may force more antipodal collapses
      A3b: V=32, D=5 -- direct comparator to H2a (V=32, D=4)
      A3c: V=64, D=5 -- extra V room, may reduce the antipodal pair count

    All configs match the Q-rank02 (H2a) baseline:
      H2_linear_matched: svd=none, linear_readout=True, match_params=True
      Adam @ lr=3e-3, depth=0, n_cross=0, n_heads=1
      1000 batches, gaussian-only training
      Per-noise test on all 16 noise types

    Predicted (if 000101 generalizes to D=5):
      - All three converge with finite MSE
      - All three show a projective-uniform distribution on RP^4
      - Axis count grows with V; antipodal pair count grows with V/D
      - Effective rank stays near full (~4.95/5)

    A3b is the critical test (matches the H2a config except D bumped to 5).
    """
    # (V, label) pairs; D is pinned to 5 for the whole A-set.
    battery_specs = (
        (16, 'A3a_V16_D5'),
        (32, 'A3b_V32_D5'),
        (64, 'A3c_V64_D5'),
    )

    configs: List[Dict[str, Any]] = []
    for V, label in battery_specs:
        variant_name = f"{label}_h64_dp0_nx0_adam"
        configs.append({
            'group': 'implicit_solver_A',
            'variant': variant_name,
            'band': 'HIGH',  # nominally HIGH -- D=5 is a new regime
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': {
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                'hidden': 64,
                'V': V,
                'D': 5,
                'depth': 0,
                'n_cross': 0,
                'n_heads': 1,
                'optimizer': 'adam',
                'lr': 3e-3,
                'grad_clip': 1.0,
                'cv_measure_every': 50,
                'soft_hand': False,
                'noise_types': [0],
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'implicit_solver_A-{variant_name} '
                f'(D=5 spherical reference, projective probe target)'
            ),
        })
    return configs
def get_implicit_solver_A_configs() -> List[Dict[str, Any]]:
    """Implicit-solver A-set Stage 1: the D=5 spherical reference configs."""
    stage1 = group_implicit_solver_A_d5_spherical()
    return stage1
def group_R_packed_polytope_test() -> List[Dict[str, Any]]:
    """Sphere-packing prediction test -- does V x D matter geometrically?

    Hypothesis (from G-Class probe v3): the 32-row x D=3 G-Class behavior
    (a rotating antipodal frame) emerged because 32 points cannot be
    uniformly arranged on S^2 -- geometric frustration. When V matches a
    natural polytope vertex count for S^(D-1), training should instead
    produce STATIC sphere-solver rows.

    Three test configs (each predicted to produce H2-LIKE static rows):
      - D=4, V=16: 16-cell (4-orthoplex) vertex count on S^3
      - D=4, V=8:  16-cell again (8 vertices = 4D cross-polytope subset)
                   or 8-cell (tesseract) -- 8 is canonical for both
      - D=3, V=20: dodecahedron vertex count on S^2

    Everything else matches H2a (Q-rank02): adam, lr=3e-3, depth=0,
    n_cross=0, H2_linear_matched (svd=none, linear_readout=True,
    match_params=True). 1000 batches, gaussian-only training, 16-noise
    per-noise test.

    Predicted result: all three produce row_stability > 0.85 and an
    antipodal pair fraction < 0.55 -- i.e. H2-LIKE character on the v3
    probe.
    """
    # (V, D, polytope_name) for each packing hypothesis.
    polytope_grid = (
        (16, 4, '16cell_orthoplex'),
        (8, 4, '8cell_or_16cell_subset'),
        (20, 3, 'dodecahedron'),
    )

    configs: List[Dict[str, Any]] = []
    for V, D, polytope in polytope_grid:
        variant_name = f"R_h64_V{V}_D{D}_{polytope}_adam"
        configs.append({
            'group': 'R',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': {
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                'hidden': 64,
                'V': V,
                'D': D,
                'depth': 0,
                'n_cross': 0,
                'n_heads': 1,
                'optimizer': 'adam',
                'lr': 3e-3,
                'grad_clip': 1.0,
                'cv_measure_every': 50,
                'soft_hand': False,
                'noise_types': [0],
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'R-HIGH-{variant_name} '
                f'(packing test, predicted H2-LIKE)'
            ),
        })
    return configs
def get_phaseR_configs() -> List[Dict[str, Any]]:
    """Phase R -- the sphere-packing prediction test (3 configs)."""
    phase_r = group_R_packed_polytope_test()
    return phase_r
def group_Q_h2_candidates() -> List[Dict[str, Any]]:
    """Top-10 P-sweep winners extended to 1000 batches.

    These are the 10 configs flagged by the P-sweep analyzer's
    continued-training-potential ranking. Each is re-run with the same
    architecture and optimizer but with batch_limit=1000 (50x the P
    sweep's 20-batch budget).

    Purpose: answer the classification questions the P sweep couldn't:
      - What's the actual convergence floor per config?
      - Does Adam catch LBFGS with enough budget? (6 Adam / 4 LBFGS in top 10)
      - Where does the loss trajectory flatten?
      - Does the discrimination ratio sharpen with more training?
      - Does the final CV land in the valid band (0.13-0.30)?

    Results feed into the H2 class-rank assignment.

    cv_measure_every=50 gives ~20 CV measurements across the run (the P
    sweep used 2, which would mean 500 measurements at 1000 batches --
    too many).
    """
    # Top 10 from the P-sweep analyzer, ranked by continued-training potential.
    TOP_10 = [
        # (hidden, V, D, depth, n_cross, optimizer)
        (64, 32, 4, 1, 0, 'lbfgs'),  # 1 -- 57123 params, P-MSE 0.053
        (64, 32, 4, 0, 0, 'adam'),   # 2 -- 40227 params, P-MSE 0.572
        (64, 32, 4, 0, 1, 'adam'),   # 3 -- 40319 params, P-MSE 0.584
        (64, 32, 4, 0, 1, 'lbfgs'),  # 4 -- 40319 params, P-MSE 0.041
        (64, 16, 4, 1, 1, 'lbfgs'),  # 5 -- 36607 params, P-MSE 0.115
        (64, 32, 3, 1, 1, 'adam'),   # 6 -- 45852 params, P-MSE 0.656
        (64, 32, 3, 0, 1, 'adam'),   # 7 -- 28956 params, P-MSE 0.641
        (64, 32, 4, 1, 1, 'adam'),   # 8 -- 57215 params, P-MSE 0.620
        (64, 32, 3, 0, 0, 'adam'),   # 9 -- 28899 params, P-MSE 0.638
        (64, 32, 2, 0, 1, 'adam'),   # 10 -- 19649 params, P-MSE 0.736
    ]

    configs: List[Dict[str, Any]] = []
    for rank, arch in enumerate(TOP_10, start=1):
        hidden, V, D, depth, n_cross, optimizer = arch
        variant_name = (
            f"Q_rank{rank:02d}_h{hidden}_V{V}_D{D}_dp{depth}"
            f"_nx{n_cross}_{optimizer}"
        )
        # Same LRs as the P sweep: Adam 3e-3, LBFGS 1.0.
        lr = 3e-3 if optimizer == 'adam' else 1.0
        configs.append({
            'group': 'Q',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,  # 50x the P sweep
            'overrides': {
                # H2_linear_matched baseline
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                # Size axes (from the P winner)
                'hidden': hidden,
                'V': V,
                'D': D,
                'depth': depth,
                'n_cross': n_cross,
                'n_heads': 1,
                # Optimizer
                'optimizer': optimizer,
                'lr': lr,
                'grad_clip': 1.0,
                # Every 50 batches gives ~20 CV measurements over the
                # 1000-batch run (the P sweep's 2 is too frequent at
                # this budget).
                'cv_measure_every': 50,
                # Pure MSE, no soft-hand
                'soft_hand': False,
                # Training: gaussian only (matches the P sweep)
                'noise_types': [0],
                # Full 16-noise test at the end
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'Q-HIGH-{variant_name} '
                f'(H2 candidate extended sweep, 1000 batches)'
            ),
        })
    return configs
def get_phaseQ_configs() -> List[Dict[str, Any]]:
    """Phase Q -- top-10 P winners at 1000 batches for H2 class-rank assignment."""
    phase_q = group_Q_h2_candidates()
    return phase_q
def get_phaseP_configs() -> List[Dict[str, Any]]:
    """Phase P (floor sweep) -- 600 configs at 20 batches each."""
    phase_p = group_P_small_battery_floor()
    return phase_p
def get_phase2_configs() -> List[Dict[str, Any]]:
    """Phase 2 matrix -- 1 epoch each at batch_size=256, resume-capable.

    Revised from the original 174-config design after Phase 1 settled the
    "does the attractor survive" question. Phase 2 now characterizes
    WITHIN-ATTRACTOR behavior over one full epoch (~3900 batches):

      - Group E (36 runs): within-attractor MSE under each soft-hand regime
      - Group H (42 runs): SVD necessity (vs learned linear readout)
      - Group L2 (6 runs): LBFGS within-attractor MSE characterization
        (MID + HIGH only; LOW stipended pending an RLBFGS engineering
        pass -- see the group_L2_lbfgs docstring)

    Total: 84 runs. Intriguing cases can be continued to epoch 3 or 5
    via the orchestrator's continue_training() function.
    """
    configs: List[Dict[str, Any]] = []
    for builder in (
        group_E_soft_hand,      # 36 runs
        group_H_svd_necessity,  # 42 runs
        group_L2_lbfgs,         # 6 runs
    ):
        configs.extend(builder())
    return configs
def summarize(configs: List[Dict[str, Any]]) -> None:
    """Print a breakdown of the config matrix for sanity-checking.

    Tallies configs by their 'group', 'band', and 'phase' keys and prints
    each breakdown sorted by key. Output goes to stdout; nothing is
    returned.

    Args:
        configs: Run-config dicts; each must carry 'group', 'band', and
            'phase' keys (as produced by the group_* builders).
    """
    # Local import keeps this block self-contained; Counter replaces the
    # hand-rolled dict.get(k, 0) + 1 tallying.
    from collections import Counter

    by_group = Counter(c['group'] for c in configs)
    by_band = Counter(c['band'] for c in configs)
    by_phase = Counter(c['phase'] for c in configs)

    print(f"Total configs: {len(configs)}")
    # Headers below were pointless f-strings (no placeholders) -- plain
    # literals emit identical text.
    print("\nBy group:")
    for g, n in sorted(by_group.items()):
        print(f" {g}: {n}")
    print("\nBy band:")
    for b, n in sorted(by_band.items()):
        print(f" {b}: {n}")
    print("\nBy phase:")
    for p, n in sorted(by_phase.items()):
        print(f" Phase {p}: {n}")
if __name__ == '__main__':
    # Print a sanity-check summary of both phase matrices.
    banner = "=" * 60
    print(banner)
    print("PHASE 1 MATRIX")
    print(banner)
    summarize(get_phase1_configs())
    print()
    print(banner)
    print("PHASE 2 MATRIX")
    print(banner)
    summarize(get_phase2_configs())