Update 14_d5_arch_config.py
Browse files- 14_d5_arch_config.py +1034 -0
14_d5_arch_config.py
CHANGED
|
@@ -0,0 +1,1034 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ablation_configs.py
|
| 3 |
+
====================
|
| 4 |
+
The ablation matrix for the three-band SVAE validation sweep.
|
| 5 |
+
|
| 6 |
+
Each config is a dict of overrides on the baseline PatchSVAE_F trainer.
|
| 7 |
+
The trainer expects:
|
| 8 |
+
- band: 'LOW' | 'MID' | 'HIGH' (selects the base architecture)
|
| 9 |
+
- variant: unique identifier for this variant within the group
|
| 10 |
+
- seed: random seed
|
| 11 |
+
- phase: 1 (1000-batch triage) | 2 (30-epoch full)
|
| 12 |
+
- overrides: dict of RunConfig field overrides
|
| 13 |
+
|
| 14 |
+
Three band representatives (kept constant across every test):
|
| 15 |
+
LOW:  S=64, V=64, D=16, h=64, d=1, patch=16, 184K params, CV target ~0.21
MID:  S=64, V=64, D=8,  h=64, d=1, patch=16, 183K params, CV target ~0.39
HIGH: S=64, V=32, D=4,  h=64, d=1, patch=4,   41K params, CV target ~1.10
|
| 18 |
+
|
| 19 |
+
Phase 1 early-stop:
|
| 20 |
+
- LOW/MID bands: train to batch 1000, record CV_ema, classify band
|
| 21 |
+
- HIGH band: train to batch 100, record CV_ema, classify band
|
| 22 |
+
|
| 23 |
+
Phase 2 full run:
|
| 24 |
+
- Group E (soft-hand): 1 epoch, 3 seeds per variant
- Group H (SVD necessity): 1 epoch, 3/2/1 seeds staged per variant
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from typing import Dict, List, Any
|
| 29 |
+
from dataclasses import dataclass, field, asdict
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ----------------------------------------------------------------------------
|
| 33 |
+
# Band representatives β the three anchor configs
|
| 34 |
+
# ----------------------------------------------------------------------------
|
| 35 |
+
|
| 36 |
+
BAND_REPS = {
|
| 37 |
+
'LOW': {
|
| 38 |
+
'img_size': 64,
|
| 39 |
+
'V': 64,
|
| 40 |
+
'D': 16,
|
| 41 |
+
'hidden': 64,
|
| 42 |
+
'depth': 1,
|
| 43 |
+
'patch_size': 16,
|
| 44 |
+
'n_cross': 1,
|
| 45 |
+
'expected_cv': 0.21,
|
| 46 |
+
'expected_params': 184_000,
|
| 47 |
+
},
|
| 48 |
+
'MID': {
|
| 49 |
+
'img_size': 64,
|
| 50 |
+
'V': 64,
|
| 51 |
+
'D': 8,
|
| 52 |
+
'hidden': 64,
|
| 53 |
+
'depth': 1,
|
| 54 |
+
'patch_size': 16,
|
| 55 |
+
'n_cross': 1,
|
| 56 |
+
'expected_cv': 0.39,
|
| 57 |
+
'expected_params': 183_000,
|
| 58 |
+
},
|
| 59 |
+
'HIGH': {
|
| 60 |
+
'img_size': 64,
|
| 61 |
+
'V': 32,
|
| 62 |
+
'D': 4,
|
| 63 |
+
'hidden': 64,
|
| 64 |
+
'depth': 1,
|
| 65 |
+
'patch_size': 4,
|
| 66 |
+
'n_cross': 1,
|
| 67 |
+
'expected_cv': 1.10,
|
| 68 |
+
'expected_params': 41_000,
|
| 69 |
+
},
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def band_classifier(cv_ema: float) -> str:
|
| 74 |
+
"""Classify a final CV-EMA value into a band."""
|
| 75 |
+
if cv_ema < 0.30:
|
| 76 |
+
return 'LOW'
|
| 77 |
+
elif cv_ema < 0.55:
|
| 78 |
+
return 'MID'
|
| 79 |
+
elif cv_ema > 0.80:
|
| 80 |
+
return 'HIGH'
|
| 81 |
+
return 'UNCLASSIFIED'
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def phase1_batch_limit(band: str) -> int:
|
| 85 |
+
"""How many batches to train before stopping for Phase 1 band classification."""
|
| 86 |
+
if band == 'HIGH':
|
| 87 |
+
return 100
|
| 88 |
+
return 1000
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def phase2_batch_limit(config: Dict[str, Any]) -> int:
|
| 92 |
+
"""How many batches per epoch for Phase 2.
|
| 93 |
+
|
| 94 |
+
Per-config override: if the config specifies 'batch_limit', use it.
|
| 95 |
+
This allows the floor sweep (P group) to cap at a few dozen batches
|
| 96 |
+
without changing defaults for existing phase-2 configs.
|
| 97 |
+
|
| 98 |
+
Default behavior (unchanged):
|
| 99 |
+
- Adam at batch_size=256: 1_000_000 / 256 β 3900 batches
|
| 100 |
+
- LBFGS at batch_size=32: normally 31250 batches, but LBFGS
|
| 101 |
+
does 20 inner iterations per outer step so ~40k gradient steps
|
| 102 |
+
per batch β we cap at 2000 outer batches = ~40k gradient steps
|
| 103 |
+
which is plenty for within-attractor convergence
|
| 104 |
+
|
| 105 |
+
The batch_size is read from the config (Phase 2 configs include
|
| 106 |
+
an explicit batch_size field).
|
| 107 |
+
"""
|
| 108 |
+
# Per-config explicit batch_limit takes precedence
|
| 109 |
+
if 'batch_limit' in config:
|
| 110 |
+
return config['batch_limit']
|
| 111 |
+
|
| 112 |
+
overrides = config.get('overrides', {})
|
| 113 |
+
if overrides.get('optimizer') == 'lbfgs':
|
| 114 |
+
return 2000 # cap for LBFGS wallclock
|
| 115 |
+
|
| 116 |
+
batch_size = config.get('batch_size', 256)
|
| 117 |
+
return 1_000_000 // batch_size
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# ----------------------------------------------------------------------------
|
| 121 |
+
# Ablation group definitions
|
| 122 |
+
# ----------------------------------------------------------------------------
|
| 123 |
+
|
| 124 |
+
def group_A_seed_replication() -> List[Dict[str, Any]]:
|
| 125 |
+
"""Reproducibility: 5 seeds Γ 3 bands = 15 runs.
|
| 126 |
+
|
| 127 |
+
Tests whether each band reproducibly appears across random inits.
|
| 128 |
+
Acceptance: >=4/5 seeds per band within +/-0.02 of expected CV.
|
| 129 |
+
"""
|
| 130 |
+
configs = []
|
| 131 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 132 |
+
for seed in range(5):
|
| 133 |
+
configs.append({
|
| 134 |
+
'group': 'A',
|
| 135 |
+
'variant': 'baseline',
|
| 136 |
+
'band': band,
|
| 137 |
+
'seed': seed,
|
| 138 |
+
'phase': 1,
|
| 139 |
+
'overrides': {}, # no overrides, just seed variation
|
| 140 |
+
'description': f'A-{band}-baseline-s{seed}',
|
| 141 |
+
})
|
| 142 |
+
return configs
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def group_B_dataset_composition() -> List[Dict[str, Any]]:
|
| 146 |
+
"""Noise-type dependence: 6 variants Γ 3 bands = 18 runs.
|
| 147 |
+
|
| 148 |
+
Tests whether band structure is architecture-driven or data-driven.
|
| 149 |
+
"""
|
| 150 |
+
variants = {
|
| 151 |
+
'B1_all16': list(range(16)),
|
| 152 |
+
'B2_gaussian_only': [0],
|
| 153 |
+
'B3_structured': [3, 4, 5, 11, 13], # block, gradient, checker, mixed, structural
|
| 154 |
+
'B4_heavy_tailed': [6, 7, 10], # cauchy, laplace, exponential (check indices)
|
| 155 |
+
'B5_first_half': list(range(8)),
|
| 156 |
+
'B6_even_indices': [0, 2, 4, 6, 8, 10, 12, 14],
|
| 157 |
+
}
|
| 158 |
+
configs = []
|
| 159 |
+
for variant_name, types in variants.items():
|
| 160 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 161 |
+
configs.append({
|
| 162 |
+
'group': 'B',
|
| 163 |
+
'variant': variant_name,
|
| 164 |
+
'band': band,
|
| 165 |
+
'seed': 0,
|
| 166 |
+
'phase': 1,
|
| 167 |
+
'overrides': {'noise_types': types},
|
| 168 |
+
'description': f'B-{band}-{variant_name}',
|
| 169 |
+
})
|
| 170 |
+
return configs
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def group_C_optimizer() -> List[Dict[str, Any]]:
|
| 174 |
+
"""Optimizer dependence: 4 variants Γ 3 bands = 12 runs.
|
| 175 |
+
|
| 176 |
+
Tests whether attractor is Adam-specific.
|
| 177 |
+
|
| 178 |
+
NOTE: LBFGS was originally included as C5 but removed 2026-04-20
|
| 179 |
+
after empirical evidence that it is incompatible with the sphere-
|
| 180 |
+
normed architecture as currently constructed. LBFGS's flat-space
|
| 181 |
+
strong Wolfe line search drives parameters away from the sphere
|
| 182 |
+
manifold during line search, producing ill-conditioned SVD inputs.
|
| 183 |
+
Symptoms observed: D=16 crashed in torch.linalg.eigh with "failed
|
| 184 |
+
to converge β ill-conditioned or too many repeated eigenvalues";
|
| 185 |
+
D=8 and D=4 completed but produced NaN MSE (CV measurements at
|
| 186 |
+
intermediate batches were valid β 0.3373 MID, 0.9435 HIGH β but
|
| 187 |
+
final test MSE was NaN, indicating parameters went non-finite
|
| 188 |
+
during training).
|
| 189 |
+
|
| 190 |
+
This is NOT a finding about LBFGS as an optimizer β it's a finding
|
| 191 |
+
about the LBFGS-sphere_norm interaction. Proper test requires
|
| 192 |
+
Riemannian LBFGS with constraint-aware line search. See scratchpad
|
| 193 |
+
entry 000080 for the dedicated LBFGS engineering pass TODO.
|
| 194 |
+
"""
|
| 195 |
+
variants = [
|
| 196 |
+
('C1_adam', {'optimizer': 'adam', 'lr': 1e-4, 'weight_decay': 0.0}),
|
| 197 |
+
('C2_sgd', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.0}),
|
| 198 |
+
('C3_sgd_momentum', {'optimizer': 'sgd', 'lr': 1e-2, 'momentum': 0.9}),
|
| 199 |
+
('C4_adamw', {'optimizer': 'adamw', 'lr': 1e-4, 'weight_decay': 0.01}),
|
| 200 |
+
]
|
| 201 |
+
configs = []
|
| 202 |
+
for variant_name, overrides in variants:
|
| 203 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 204 |
+
configs.append({
|
| 205 |
+
'group': 'C',
|
| 206 |
+
'variant': variant_name,
|
| 207 |
+
'band': band,
|
| 208 |
+
'seed': 0,
|
| 209 |
+
'phase': 1,
|
| 210 |
+
'overrides': overrides,
|
| 211 |
+
'description': f'C-{band}-{variant_name}',
|
| 212 |
+
})
|
| 213 |
+
return configs
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def group_D_schedule() -> List[Dict[str, Any]]:
|
| 217 |
+
"""LR schedule: 5 variants Γ 3 bands = 15 runs."""
|
| 218 |
+
variants = [
|
| 219 |
+
('D1_cosine', {'scheduler': 'cosine'}),
|
| 220 |
+
('D2_constant', {'scheduler': 'constant'}),
|
| 221 |
+
('D3_linear_decay', {'scheduler': 'linear'}),
|
| 222 |
+
('D4_warm_restart', {'scheduler': 'cosine_warm_restarts', 'T_0': 1000}),
|
| 223 |
+
('D5_one_cycle', {'scheduler': 'one_cycle'}),
|
| 224 |
+
]
|
| 225 |
+
configs = []
|
| 226 |
+
for variant_name, overrides in variants:
|
| 227 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 228 |
+
configs.append({
|
| 229 |
+
'group': 'D',
|
| 230 |
+
'variant': variant_name,
|
| 231 |
+
'band': band,
|
| 232 |
+
'seed': 0,
|
| 233 |
+
'phase': 1,
|
| 234 |
+
'overrides': overrides,
|
| 235 |
+
'description': f'D-{band}-{variant_name}',
|
| 236 |
+
})
|
| 237 |
+
return configs
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def group_E_soft_hand() -> List[Dict[str, Any]]:
|
| 241 |
+
"""Soft-hand guidance β PHASE 2 (1 epoch, ~3900 batches at batch_size=256).
|
| 242 |
+
|
| 243 |
+
Phase 1 E_preview already showed all four variants reach the same band
|
| 244 |
+
at 1000 batches (all within 0.0014 CV). The Phase 2 question is NO
|
| 245 |
+
LONGER "does the attractor survive" β that's settled β but rather:
|
| 246 |
+
"what's the within-attractor reconstruction MSE under each soft-hand
|
| 247 |
+
regime over a full epoch?"
|
| 248 |
+
|
| 249 |
+
Primary comparison: E1 (full soft-hand) vs E2 (pure MSE). If MSE
|
| 250 |
+
differs meaningfully, soft-hand is trading reconstruction quality
|
| 251 |
+
for geometric coherence at an epoch-scale budget.
|
| 252 |
+
|
| 253 |
+
4 variants Γ 3 bands Γ 3 seeds = 36 runs.
|
| 254 |
+
"""
|
| 255 |
+
variants = [
|
| 256 |
+
('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
|
| 257 |
+
('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
|
| 258 |
+
('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
|
| 259 |
+
('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
|
| 260 |
+
]
|
| 261 |
+
configs = []
|
| 262 |
+
for variant_name, overrides in variants:
|
| 263 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 264 |
+
for seed in range(3):
|
| 265 |
+
configs.append({
|
| 266 |
+
'group': 'E',
|
| 267 |
+
'variant': variant_name,
|
| 268 |
+
'band': band,
|
| 269 |
+
'seed': seed,
|
| 270 |
+
'phase': 2,
|
| 271 |
+
'num_epochs': 1,
|
| 272 |
+
'batch_size': 256,
|
| 273 |
+
'overrides': overrides,
|
| 274 |
+
'description': f'E-{band}-{variant_name}-s{seed}',
|
| 275 |
+
})
|
| 276 |
+
return configs
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def group_E_subset_phase1() -> List[Dict[str, Any]]:
|
| 280 |
+
"""E subset for Phase 1 preview β 1 seed per variant, 1000 batches.
|
| 281 |
+
|
| 282 |
+
Quick read on whether E2 even approaches the attractor before
|
| 283 |
+
committing to full Phase 2 Group E. 4 variants Γ 3 bands = 12 runs.
|
| 284 |
+
"""
|
| 285 |
+
variants = [
|
| 286 |
+
('E1_full_softhand', {'soft_hand': True, 'boost': 0.5, 'cv_penalty': 0.3}),
|
| 287 |
+
('E2_pure_mse', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0}),
|
| 288 |
+
('E3_measure_only', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 0.0, 'cv_measurement_only': True}),
|
| 289 |
+
('E4_hard_cv_penalty', {'soft_hand': False, 'boost': 0.0, 'cv_penalty': 1.0, 'hard_cv_target': 0.21}),
|
| 290 |
+
]
|
| 291 |
+
configs = []
|
| 292 |
+
for variant_name, overrides in variants:
|
| 293 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 294 |
+
configs.append({
|
| 295 |
+
'group': 'E_preview',
|
| 296 |
+
'variant': variant_name,
|
| 297 |
+
'band': band,
|
| 298 |
+
'seed': 0,
|
| 299 |
+
'phase': 1,
|
| 300 |
+
'overrides': overrides,
|
| 301 |
+
'description': f'Eprev-{band}-{variant_name}',
|
| 302 |
+
})
|
| 303 |
+
return configs
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def group_F_activation() -> List[Dict[str, Any]]:
|
| 307 |
+
"""Activation function: 5 variants Γ 3 bands = 15 runs."""
|
| 308 |
+
variants = [
|
| 309 |
+
('F1_gelu', {'activation': 'gelu'}),
|
| 310 |
+
('F2_relu', {'activation': 'relu'}),
|
| 311 |
+
('F3_silu', {'activation': 'silu'}),
|
| 312 |
+
('F4_tanh', {'activation': 'tanh'}),
|
| 313 |
+
('F5_identity', {'activation': 'identity'}),
|
| 314 |
+
]
|
| 315 |
+
configs = []
|
| 316 |
+
for variant_name, overrides in variants:
|
| 317 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 318 |
+
configs.append({
|
| 319 |
+
'group': 'F',
|
| 320 |
+
'variant': variant_name,
|
| 321 |
+
'band': band,
|
| 322 |
+
'seed': 0,
|
| 323 |
+
'phase': 1,
|
| 324 |
+
'overrides': overrides,
|
| 325 |
+
'description': f'F-{band}-{variant_name}',
|
| 326 |
+
})
|
| 327 |
+
return configs
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def group_G_sphere_norm() -> List[Dict[str, Any]]:
|
| 331 |
+
"""Sphere-norm ablation: 4 variants Γ 3 bands = 12 runs.
|
| 332 |
+
|
| 333 |
+
Expected per framework: G2 (no sphere-norm) reproduces charge-
|
| 334 |
+
discharge catastrophe. G3/G4 may or may not preserve the band.
|
| 335 |
+
"""
|
| 336 |
+
variants = [
|
| 337 |
+
('G1_sphere_norm', {'row_norm': 'sphere'}), # baseline, F.normalize(dim=-1)
|
| 338 |
+
('G2_no_norm', {'row_norm': 'none'}), # raw M to SVD
|
| 339 |
+
('G3_layer_norm', {'row_norm': 'layer_norm'}),
|
| 340 |
+
('G4_scale_only', {'row_norm': 'scale_only'}),
|
| 341 |
+
]
|
| 342 |
+
configs = []
|
| 343 |
+
for variant_name, overrides in variants:
|
| 344 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 345 |
+
configs.append({
|
| 346 |
+
'group': 'G',
|
| 347 |
+
'variant': variant_name,
|
| 348 |
+
'band': band,
|
| 349 |
+
'seed': 0,
|
| 350 |
+
'phase': 1,
|
| 351 |
+
'overrides': overrides,
|
| 352 |
+
'description': f'G-{band}-{variant_name}',
|
| 353 |
+
})
|
| 354 |
+
return configs
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def group_H_svd_necessity() -> List[Dict[str, Any]]:
|
| 358 |
+
"""SVD necessity β PHASE 2 (1 epoch, ~3900 batches at batch_size=256).
|
| 359 |
+
|
| 360 |
+
Tests whether learned linear readout can match SVD, and whether
|
| 361 |
+
fp64 SVD precision and per-batch SVD are load-bearing.
|
| 362 |
+
|
| 363 |
+
Staged seed counts based on the question each variant answers:
|
| 364 |
+
- H1/H2/H3 (3 seeds): core SVD-vs-linear comparison, needs variance
|
| 365 |
+
- H4/H5 (2 seeds): precision/batching questions, binary yes/no
|
| 366 |
+
- H6 (1 seed): expected-failure confirmation
|
| 367 |
+
|
| 368 |
+
Total: 3Γ3 + 3Γ3 + 3Γ3 + 3Γ2 + 3Γ2 + 3Γ1 = 42 runs
|
| 369 |
+
"""
|
| 370 |
+
variants_full = [ # 3 seeds
|
| 371 |
+
('H1_svd_fp64', {'svd': 'fp64'}),
|
| 372 |
+
('H2_linear_matched', {'svd': 'none', 'linear_readout': True, 'match_params': True}),
|
| 373 |
+
('H3_linear_unmatched', {'svd': 'none', 'linear_readout': True, 'match_params': False}),
|
| 374 |
+
]
|
| 375 |
+
variants_probe = [ # 2 seeds
|
| 376 |
+
('H4_svd_fp32', {'svd': 'fp32'}),
|
| 377 |
+
('H5_batch_shared_svd', {'svd': 'batch_shared'}),
|
| 378 |
+
]
|
| 379 |
+
variants_confirm = [ # 1 seed, expected failure
|
| 380 |
+
('H6_no_svd_direct', {'svd': 'none', 'linear_readout': False}),
|
| 381 |
+
]
|
| 382 |
+
configs = []
|
| 383 |
+
for variants, n_seeds in [(variants_full, 3), (variants_probe, 2), (variants_confirm, 1)]:
|
| 384 |
+
for variant_name, overrides in variants:
|
| 385 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 386 |
+
for seed in range(n_seeds):
|
| 387 |
+
configs.append({
|
| 388 |
+
'group': 'H',
|
| 389 |
+
'variant': variant_name,
|
| 390 |
+
'band': band,
|
| 391 |
+
'seed': seed,
|
| 392 |
+
'phase': 2,
|
| 393 |
+
'num_epochs': 1,
|
| 394 |
+
'batch_size': 256,
|
| 395 |
+
'overrides': overrides,
|
| 396 |
+
'description': f'H-{band}-{variant_name}-s{seed}',
|
| 397 |
+
})
|
| 398 |
+
return configs
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def group_L2_lbfgs() -> List[Dict[str, Any]]:
|
| 402 |
+
"""LBFGS characterization β PHASE 2 (1 epoch, ~3900 batches at batch_size=256).
|
| 403 |
+
|
| 404 |
+
Front-loads LBFGS investigation after Phil's isolated test at 100
|
| 405 |
+
batches showed LBFGS + pure MSE + no soft-hand reaches the HIGH
|
| 406 |
+
attractor (CV 0.869) with better within-attractor reconstruction MSE
|
| 407 |
+
(0.0644) than Adam + soft-hand achieves at 30 epochs (0.072).
|
| 408 |
+
|
| 409 |
+
Phase 2 L2 tests whether this gap holds at epoch scale and whether
|
| 410 |
+
MID band shows a similar effect.
|
| 411 |
+
|
| 412 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 413 |
+
STIPEND: LOW band (D=16) OMITTED pending LBFGS engineering pass.
|
| 414 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 415 |
+
Isolated test in Phase 1 session confirmed LBFGS + sphere_norm +
|
| 416 |
+
D=16 crashes torch.linalg.eigh (error code 15, ill-conditioned
|
| 417 |
+
Gram matrix). PyTorch LBFGS's flat-space strong Wolfe line search
|
| 418 |
+
drives parameters off the sphere manifold, producing degenerate
|
| 419 |
+
SVD inputs. Fix requires Riemannian (constraint-aware) line
|
| 420 |
+
search β see scratchpad entry 000080 for the engineering pass
|
| 421 |
+
TODO. L2-LOW will be runnable once RLBFGS integration lands.
|
| 422 |
+
|
| 423 |
+
Current scope: MID + HIGH only, pure MSE + no soft-hand
|
| 424 |
+
(matching the Phil isolated test configuration that produced
|
| 425 |
+
the 0.869/0.0644 data point).
|
| 426 |
+
|
| 427 |
+
2 bands Γ 3 seeds = 6 runs.
|
| 428 |
+
"""
|
| 429 |
+
variants = [
|
| 430 |
+
('L2_lbfgs_pure_mse', {
|
| 431 |
+
'optimizer': 'lbfgs',
|
| 432 |
+
'lr': 1.0,
|
| 433 |
+
'batch_size': 32, # LBFGS small-batch required for closure stability
|
| 434 |
+
'soft_hand': False, # no soft-hand (corrupted Hessian approximation)
|
| 435 |
+
'boost': 0.0,
|
| 436 |
+
'cv_penalty': 0.0,
|
| 437 |
+
}),
|
| 438 |
+
]
|
| 439 |
+
configs = []
|
| 440 |
+
for variant_name, overrides in variants:
|
| 441 |
+
for band in ['MID', 'HIGH']: # LOW stipended β see docstring
|
| 442 |
+
for seed in range(3):
|
| 443 |
+
configs.append({
|
| 444 |
+
'group': 'L2',
|
| 445 |
+
'variant': variant_name,
|
| 446 |
+
'band': band,
|
| 447 |
+
'seed': seed,
|
| 448 |
+
'phase': 2,
|
| 449 |
+
'num_epochs': 1,
|
| 450 |
+
'batch_size': 32, # overrides default (LBFGS needs small batch)
|
| 451 |
+
'overrides': overrides,
|
| 452 |
+
'description': f'L2-{band}-{variant_name}-s{seed}',
|
| 453 |
+
})
|
| 454 |
+
return configs
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def group_I_cross_attention() -> List[Dict[str, Any]]:
|
| 458 |
+
"""Cross-attention necessity: 4 variants Γ 3 bands = 12 runs."""
|
| 459 |
+
variants = [
|
| 460 |
+
('I1_1layer', {'n_cross': 1, 'max_alpha': 0.2}),
|
| 461 |
+
('I2_0layers', {'n_cross': 0}),
|
| 462 |
+
('I3_2layers', {'n_cross': 2, 'max_alpha': 0.2}),
|
| 463 |
+
('I4_unbounded_alpha', {'n_cross': 1, 'max_alpha': 1.0}),
|
| 464 |
+
]
|
| 465 |
+
configs = []
|
| 466 |
+
for variant_name, overrides in variants:
|
| 467 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 468 |
+
configs.append({
|
| 469 |
+
'group': 'I',
|
| 470 |
+
'variant': variant_name,
|
| 471 |
+
'band': band,
|
| 472 |
+
'seed': 0,
|
| 473 |
+
'phase': 1,
|
| 474 |
+
'overrides': overrides,
|
| 475 |
+
'description': f'I-{band}-{variant_name}',
|
| 476 |
+
})
|
| 477 |
+
return configs
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def group_J_capacity_within_LOW() -> List[Dict[str, Any]]:
|
| 481 |
+
"""Minimum on-attractor parameter count β LOW band only, 5 variants."""
|
| 482 |
+
variants = [
|
| 483 |
+
('J1_V64_h64', {'V': 64, 'hidden': 64}), # baseline, 184K
|
| 484 |
+
('J2_V32_h32', {'V': 32, 'hidden': 32}), # ~50K
|
| 485 |
+
('J3_V16_h32', {'V': 16, 'hidden': 32}), # ~30K
|
| 486 |
+
('J4_V64_h32', {'V': 64, 'hidden': 32}), # ~100K
|
| 487 |
+
('J5_V128_h128', {'V': 128, 'hidden': 128}), # ~528K
|
| 488 |
+
]
|
| 489 |
+
configs = []
|
| 490 |
+
for variant_name, overrides in variants:
|
| 491 |
+
configs.append({
|
| 492 |
+
'group': 'J',
|
| 493 |
+
'variant': variant_name,
|
| 494 |
+
'band': 'LOW',
|
| 495 |
+
'seed': 0,
|
| 496 |
+
'phase': 1,
|
| 497 |
+
'overrides': overrides,
|
| 498 |
+
'description': f'J-LOW-{variant_name}',
|
| 499 |
+
})
|
| 500 |
+
return configs
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
def group_K_batch_size() -> List[Dict[str, Any]]:
|
| 504 |
+
"""Batch size sensitivity: 4 variants Γ 3 bands = 12 runs."""
|
| 505 |
+
variants = [
|
| 506 |
+
('K1_bs128', {'batch_size': 128}),
|
| 507 |
+
('K2_bs32', {'batch_size': 32}),
|
| 508 |
+
('K3_bs512', {'batch_size': 512}),
|
| 509 |
+
('K4_bs1024', {'batch_size': 1024}),
|
| 510 |
+
]
|
| 511 |
+
configs = []
|
| 512 |
+
for variant_name, overrides in variants:
|
| 513 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 514 |
+
configs.append({
|
| 515 |
+
'group': 'K',
|
| 516 |
+
'variant': variant_name,
|
| 517 |
+
'band': band,
|
| 518 |
+
'seed': 0,
|
| 519 |
+
'phase': 1,
|
| 520 |
+
'overrides': overrides,
|
| 521 |
+
'description': f'K-{band}-{variant_name}',
|
| 522 |
+
})
|
| 523 |
+
return configs
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def group_L_initialization() -> List[Dict[str, Any]]:
|
| 527 |
+
"""Init: 4 variants Γ 3 bands = 12 runs."""
|
| 528 |
+
variants = [
|
| 529 |
+
('L1_orthogonal', {'init': 'orthogonal'}),
|
| 530 |
+
('L2_kaiming', {'init': 'kaiming_normal'}),
|
| 531 |
+
('L3_xavier', {'init': 'xavier_uniform'}),
|
| 532 |
+
('L4_normal_small', {'init': 'normal_0_02'}),
|
| 533 |
+
]
|
| 534 |
+
configs = []
|
| 535 |
+
for variant_name, overrides in variants:
|
| 536 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 537 |
+
configs.append({
|
| 538 |
+
'group': 'L',
|
| 539 |
+
'variant': variant_name,
|
| 540 |
+
'band': band,
|
| 541 |
+
'seed': 0,
|
| 542 |
+
'phase': 1,
|
| 543 |
+
'overrides': overrides,
|
| 544 |
+
'description': f'L-{band}-{variant_name}',
|
| 545 |
+
})
|
| 546 |
+
return configs
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
def group_M_brute_force_sgd() -> List[Dict[str, Any]]:
|
| 550 |
+
"""Brute-force SGD stress: 3 variants Γ 3 bands = 9 runs."""
|
| 551 |
+
variants = [
|
| 552 |
+
('M1_sgd_aggressive', {'optimizer': 'sgd', 'lr': 1e-1, 'momentum': 0.0, 'warmup': 0}),
|
| 553 |
+
('M2_sgd_huge_lr', {'optimizer': 'sgd', 'lr': 1.0, 'momentum': 0.0, 'grad_clip': 1.0}),
|
| 554 |
+
('M3_sgd_high_momentum',{'optimizer': 'sgd', 'lr': 3e-3, 'momentum': 0.99}),
|
| 555 |
+
]
|
| 556 |
+
configs = []
|
| 557 |
+
for variant_name, overrides in variants:
|
| 558 |
+
for band in ['LOW', 'MID', 'HIGH']:
|
| 559 |
+
configs.append({
|
| 560 |
+
'group': 'M',
|
| 561 |
+
'variant': variant_name,
|
| 562 |
+
'band': band,
|
| 563 |
+
'seed': 0,
|
| 564 |
+
'phase': 1,
|
| 565 |
+
'overrides': overrides,
|
| 566 |
+
'description': f'M-{band}-{variant_name}',
|
| 567 |
+
})
|
| 568 |
+
return configs
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def group_N_uniformity_diagnostic() -> List[Dict[str, Any]]:
|
| 572 |
+
"""Attractor uniformity diagnostic β NOT a standalone group.
|
| 573 |
+
|
| 574 |
+
Instead, ADDED TO EVERY other variant's post-training analysis:
|
| 575 |
+
1. Extract final sphere-normed rows
|
| 576 |
+
2. Compute pentachoron CV at n_samples=2000
|
| 577 |
+
3. Compare to uniform-sphere prediction for that D
|
| 578 |
+
4. Record observed_CV, uniform_CV, deviation in final_report.json
|
| 579 |
+
|
| 580 |
+
This function returns 0 standalone configs β Group N is a flag
|
| 581 |
+
that every other group's runs should include the diagnostic.
|
| 582 |
+
"""
|
| 583 |
+
return []
|
| 584 |
+
|
| 585 |
+
|
| 586 |
+
# ----------------------------------------------------------------------------
|
| 587 |
+
# Full matrix assembly
|
| 588 |
+
# ----------------------------------------------------------------------------
|
| 589 |
+
|
| 590 |
+
def get_phase1_configs() -> List[Dict[str, Any]]:
|
| 591 |
+
"""Phase 1 matrix β all band-classification ablations.
|
| 592 |
+
|
| 593 |
+
Recommended run order (most informative first):
|
| 594 |
+
1. Group A (seed replication) β foundational
|
| 595 |
+
2. Group G (sphere-norm) β framework verification
|
| 596 |
+
3. Group E_preview (soft-hand 1000-batch preview)
|
| 597 |
+
4. Group B, C, D, F, I, J, K, L, M β remaining ablations
|
| 598 |
+
"""
|
| 599 |
+
return (
|
| 600 |
+
group_A_seed_replication() # 15 runs
|
| 601 |
+
+ group_G_sphere_norm() # 12 runs
|
| 602 |
+
+ group_E_subset_phase1() # 12 runs
|
| 603 |
+
+ group_B_dataset_composition() # 18 runs
|
| 604 |
+
+ group_C_optimizer() # 15 runs
|
| 605 |
+
+ group_D_schedule() # 15 runs
|
| 606 |
+
+ group_F_activation() # 15 runs
|
| 607 |
+
+ group_I_cross_attention() # 12 runs
|
| 608 |
+
+ group_J_capacity_within_LOW() # 5 runs
|
| 609 |
+
+ group_K_batch_size() # 12 runs
|
| 610 |
+
+ group_L_initialization() # 12 runs
|
| 611 |
+
+ group_M_brute_force_sgd() # 9 runs
|
| 612 |
+
)
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
def group_P_small_battery_floor() -> List[Dict[str, Any]]:
    """Small-battery floor sweep -- Phase 2 variant with a tiny batch budget.

    Grid-sweeps the architecture at the H2_linear_matched baseline to find
    the smallest battery that still reconstructs gaussian within a
    reasonable multiplier of the h2-64 floor AND lands in a valid
    geometric attractor (CV in the MID/HIGH range).

    Grid axes (full product 5 x 5 x 3 x 2 x 2 x 2 = 600 runs):
      hidden:    {4, 8, 16, 32, 64}
      V:         {2, 4, 8, 16, 32}
      D:         {2, 3, 4}
      depth:     {0, 1}
      n_cross:   {0, 1}
      optimizer: {'adam', 'lbfgs'}

    Pins (H2_linear_matched baseline):
      svd='none', linear_readout=True, match_params=True
      band='HIGH' (patch_size=4, img_size=64)
      batch_size=256
      batch_limit=20 (5120 samples seen -- matches the floor-sweep budget)

    NOTE: smooth_mid is NOT varied here -- PatchSVAE_F_Ablation does not
    expose it as a parameter, so every config uses its default
    BoundarySmooth. If smooth_mid variation is needed later, plumb it
    through the model class and add it as a grid axis.

    LIMITATION: cv_of() returns 0 for V<5 (pentachoron volume needs >=5
    points). Configs with V in {2, 4} will report observed_sphere_cv=0,
    cv_ema=0, and predicted_band='LOW'. This is an architectural
    constraint of the geometric validity metric, not a training failure.
    Use test_mse_per_noise[0] and train_loss_trajectory as the primary
    quality metrics for those configs; CV-based analysis applies only to
    V>=8 configs.

    Records via run_ablation_config's full report: CV_ema, cv_last, S0,
    SD, ratio, erank, observed_sphere_cv, band_deviation, predicted_band,
    band_match, params_finite, cv_trajectory, train_loss_trajectory,
    test_mse, test_mse_per_noise, plus per-config wallclock and
    batches_completed.
    """

    def make_config(hidden: int, V: int, D: int, depth: int,
                    n_cross: int, optimizer: str) -> Dict[str, Any]:
        """Assemble one grid-point config at the H2_linear_matched baseline."""
        variant_name = (
            f"P_h{hidden}_V{V}_D{D}_dp{depth}"
            f"_nx{n_cross}_{optimizer}"
        )
        # Per-optimizer LR tuned for the 20-step budget: Adam at the
        # Phase-2 default of 1e-4 barely moves in 20 steps on small
        # models, so use 3e-3. LBFGS's line search handles its own step
        # sizing; 1.0 is the library default for a unit Wolfe step.
        lr = 3e-3 if optimizer == 'adam' else 1.0
        return {
            'group': 'P',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 20,
            'overrides': {
                # H2_linear_matched baseline
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                # Size axes
                'hidden': hidden,
                'V': V,
                'D': D,
                'depth': depth,
                'n_cross': n_cross,
                # Pin n_heads=1: D varies over {2, 3, 4} and the
                # default n_heads=4 would fail for D=2, 3.
                'n_heads': 1,
                # Optimizer + LR tuned for the short budget
                'optimizer': optimizer,
                'lr': lr,
                # Gradient clipping catches LBFGS explosions (both
                # initial-step Wolfe failures on tiny params and
                # mid-training Hessian-approximation corruption on
                # depth=1 + n_cross=1 configs). Standard defensive
                # practice for small-model sweeps; no cost when not
                # triggered.
                'grad_clip': 1.0,
                # Measure CV every 2 batches (50 was too coarse for a
                # 20-batch sweep).
                'cv_measure_every': 2,
                # Pure MSE, no soft-hand (per 000079 -- avoids LBFGS
                # Hessian corruption).
                'soft_hand': False,
                # Training: gaussian only (for floor detection)
                'noise_types': [0],
                # Testing: all 16 noises, 256 each -- kept separate
                # from the training distribution so per-noise
                # generalization is measured.
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'P-HIGH-{variant_name} '
                f'(floor sweep, 20-batch budget)'
            ),
        }

    configs: List[Dict[str, Any]] = []
    for hidden in (4, 8, 16, 32, 64):
        for V in (2, 4, 8, 16, 32):
            for D in (2, 3, 4):
                for depth in (0, 1):
                    for n_cross in (0, 1):
                        for optimizer in ('adam', 'lbfgs'):
                            configs.append(
                                make_config(hidden, V, D, depth,
                                            n_cross, optimizer)
                            )
    return configs
def group_implicit_solver_A_d5_spherical() -> List[Dict[str, Any]]:
    """Implicit-solver A-set: D=5 spherical reference batteries.

    Three configs test the projective-axis hypothesis at D=5:
      A3a: V=16, D=5 -- minimal V, may force more antipodal collapses
      A3b: V=32, D=5 -- direct comparator to H2a (V=32, D=4)
      A3c: V=64, D=5 -- extra V room, may reduce the antipodal pair count

    All configs match the Q-rank02 (H2a) baseline:
      H2_linear_matched: svd=none, linear_readout=True, match_params=True
      Adam @ lr=3e-3, depth=0, n_cross=0, n_heads=1
      1000 batches, gaussian-only training
      Per-noise test on all 16 noise types

    Predicted (if 000101 generalizes to D=5):
      - All three converge with finite MSE
      - All three show a projective-uniform distribution on RP^4
      - Axis count grows with V; antipodal pair count grows with V/D
      - Effective rank stays near full (~4.95/5)

    A3b is the critical test (matches the H2a config except D bumped to 5).
    """
    # (V, label) pairs; D is pinned to 5 for the whole A-set.
    battery_specs = (
        (16, 'A3a_V16_D5'),
        (32, 'A3b_V32_D5'),
        (64, 'A3c_V64_D5'),
    )

    configs: List[Dict[str, Any]] = []
    for V, label in battery_specs:
        variant_name = f"{label}_h64_dp0_nx0_adam"
        configs.append({
            'group': 'implicit_solver_A',
            'variant': variant_name,
            'band': 'HIGH',  # nominally HIGH -- D=5 is a new regime
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': {
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                'hidden': 64,
                'V': V,
                'D': 5,
                'depth': 0,
                'n_cross': 0,
                'n_heads': 1,
                'optimizer': 'adam',
                'lr': 3e-3,
                'grad_clip': 1.0,
                'cv_measure_every': 50,
                'soft_hand': False,
                'noise_types': [0],
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'implicit_solver_A-{variant_name} '
                f'(D=5 spherical reference, projective probe target)'
            ),
        })
    return configs
def get_implicit_solver_A_configs() -> List[Dict[str, Any]]:
    """Implicit-solver A-set Stage 1: the D=5 spherical reference configs."""
    stage1 = group_implicit_solver_A_d5_spherical()
    return stage1
def group_R_packed_polytope_test() -> List[Dict[str, Any]]:
    """Sphere-packing prediction test -- does V x D matter geometrically?

    Hypothesis (from G-Class probe v3): the 32-row x D=3 G-Class behavior
    (a rotating antipodal frame) emerged because 32 points cannot be
    uniformly arranged on S^2 -- geometric frustration. When V matches a
    natural polytope vertex count for S^(D-1), training should instead
    produce STATIC sphere-solver rows.

    Three test configs (each predicted to produce H2-LIKE static rows):
      - D=4, V=16: 16-cell (4-orthoplex) vertex count on S^3
      - D=4, V=8:  16-cell again (8 vertices = 4D cross-polytope subset)
                   or 8-cell (tesseract) -- 8 is canonical for both
      - D=3, V=20: dodecahedron vertex count on S^2

    Everything else matches H2a (Q-rank02): adam, lr=3e-3, depth=0,
    n_cross=0, H2_linear_matched (svd=none, linear_readout=True,
    match_params=True). 1000 batches, gaussian-only training, 16-noise
    per-noise test.

    Predicted result: all three produce row_stability > 0.85 and an
    antipodal pair fraction < 0.55 -- i.e. H2-LIKE character on the v3
    probe.
    """
    # (V, D, polytope_name) for each packing hypothesis.
    polytope_grid = (
        (16, 4, '16cell_orthoplex'),
        (8, 4, '8cell_or_16cell_subset'),
        (20, 3, 'dodecahedron'),
    )

    configs: List[Dict[str, Any]] = []
    for V, D, polytope in polytope_grid:
        variant_name = f"R_h64_V{V}_D{D}_{polytope}_adam"
        configs.append({
            'group': 'R',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,
            'overrides': {
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                'hidden': 64,
                'V': V,
                'D': D,
                'depth': 0,
                'n_cross': 0,
                'n_heads': 1,
                'optimizer': 'adam',
                'lr': 3e-3,
                'grad_clip': 1.0,
                'cv_measure_every': 50,
                'soft_hand': False,
                'noise_types': [0],
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'R-HIGH-{variant_name} '
                f'(packing test, predicted H2-LIKE)'
            ),
        })
    return configs
def get_phaseR_configs() -> List[Dict[str, Any]]:
    """Phase R -- the sphere-packing prediction test (3 configs)."""
    phase_r = group_R_packed_polytope_test()
    return phase_r
def group_Q_h2_candidates() -> List[Dict[str, Any]]:
    """Top-10 P-sweep winners extended to 1000 batches.

    These are the 10 configs flagged by the P-sweep analyzer's
    continued-training-potential ranking. Each is re-run with the same
    architecture and optimizer but with batch_limit=1000 (50x the P
    sweep's 20-batch budget).

    Purpose: answer the classification questions the P sweep couldn't:
      - What's the actual convergence floor per config?
      - Does Adam catch LBFGS with enough budget? (6 Adam / 4 LBFGS in top 10)
      - Where does the loss trajectory flatten?
      - Does the discrimination ratio sharpen with more training?
      - Does the final CV land in the valid band (0.13-0.30)?

    Results feed into the H2 class-rank assignment.

    cv_measure_every=50 gives ~20 CV measurements across the run (the P
    sweep used 2, which would mean 500 measurements at 1000 batches --
    too many).
    """
    # Top 10 from the P-sweep analyzer, ranked by continued-training potential.
    TOP_10 = [
        # (hidden, V, D, depth, n_cross, optimizer)
        (64, 32, 4, 1, 0, 'lbfgs'),  # 1 -- 57123 params, P-MSE 0.053
        (64, 32, 4, 0, 0, 'adam'),   # 2 -- 40227 params, P-MSE 0.572
        (64, 32, 4, 0, 1, 'adam'),   # 3 -- 40319 params, P-MSE 0.584
        (64, 32, 4, 0, 1, 'lbfgs'),  # 4 -- 40319 params, P-MSE 0.041
        (64, 16, 4, 1, 1, 'lbfgs'),  # 5 -- 36607 params, P-MSE 0.115
        (64, 32, 3, 1, 1, 'adam'),   # 6 -- 45852 params, P-MSE 0.656
        (64, 32, 3, 0, 1, 'adam'),   # 7 -- 28956 params, P-MSE 0.641
        (64, 32, 4, 1, 1, 'adam'),   # 8 -- 57215 params, P-MSE 0.620
        (64, 32, 3, 0, 0, 'adam'),   # 9 -- 28899 params, P-MSE 0.638
        (64, 32, 2, 0, 1, 'adam'),   # 10 -- 19649 params, P-MSE 0.736
    ]

    configs: List[Dict[str, Any]] = []
    for rank, arch in enumerate(TOP_10, start=1):
        hidden, V, D, depth, n_cross, optimizer = arch
        variant_name = (
            f"Q_rank{rank:02d}_h{hidden}_V{V}_D{D}_dp{depth}"
            f"_nx{n_cross}_{optimizer}"
        )
        # Same LRs as the P sweep: Adam 3e-3, LBFGS 1.0.
        lr = 3e-3 if optimizer == 'adam' else 1.0
        configs.append({
            'group': 'Q',
            'variant': variant_name,
            'band': 'HIGH',
            'seed': 42,
            'phase': 2,
            'num_epochs': 1,
            'batch_size': 256,
            'batch_limit': 1000,  # 50x the P sweep
            'overrides': {
                # H2_linear_matched baseline
                'svd': 'none',
                'linear_readout': True,
                'match_params': True,
                # Size axes (from the P winner)
                'hidden': hidden,
                'V': V,
                'D': D,
                'depth': depth,
                'n_cross': n_cross,
                'n_heads': 1,
                # Optimizer
                'optimizer': optimizer,
                'lr': lr,
                'grad_clip': 1.0,
                # Every 50 batches gives ~20 CV measurements over the
                # 1000-batch run (the P sweep's 2 is too frequent at
                # this budget).
                'cv_measure_every': 50,
                # Pure MSE, no soft-hand
                'soft_hand': False,
                # Training: gaussian only (matches the P sweep)
                'noise_types': [0],
                # Full 16-noise test at the end
                'test_noise_types': list(range(16)),
                'test_samples_per_noise': 256,
                'test_batch_size': 64,
            },
            'description': (
                f'Q-HIGH-{variant_name} '
                f'(H2 candidate extended sweep, 1000 batches)'
            ),
        })
    return configs
def get_phaseQ_configs() -> List[Dict[str, Any]]:
    """Phase Q -- top-10 P winners at 1000 batches for H2 class-rank assignment."""
    phase_q = group_Q_h2_candidates()
    return phase_q
def get_phaseP_configs() -> List[Dict[str, Any]]:
    """Phase P (floor sweep) -- 600 configs at 20 batches each."""
    phase_p = group_P_small_battery_floor()
    return phase_p
def get_phase2_configs() -> List[Dict[str, Any]]:
    """Phase 2 matrix -- 1 epoch each at batch_size=256, resume-capable.

    Revised from the original 174-config design after Phase 1 settled the
    "does the attractor survive" question. Phase 2 now characterizes
    WITHIN-ATTRACTOR behavior over one full epoch (~3900 batches):

      - Group E (36 runs): within-attractor MSE under each soft-hand regime
      - Group H (42 runs): SVD necessity (vs learned linear readout)
      - Group L2 (6 runs): LBFGS within-attractor MSE characterization
        (MID + HIGH only; LOW stipended pending an RLBFGS engineering
        pass -- see the group_L2_lbfgs docstring)

    Total: 84 runs. Intriguing cases can be continued to epoch 3 or 5
    via the orchestrator's continue_training() function.
    """
    configs: List[Dict[str, Any]] = []
    for builder in (
        group_E_soft_hand,      # 36 runs
        group_H_svd_necessity,  # 42 runs
        group_L2_lbfgs,         # 6 runs
    ):
        configs.extend(builder())
    return configs
def summarize(configs: List[Dict[str, Any]]) -> None:
    """Print a breakdown of the config matrix for sanity-checking.

    Tallies configs by their 'group', 'band', and 'phase' keys and prints
    each breakdown sorted by key. Output goes to stdout; nothing is
    returned.

    Args:
        configs: Run-config dicts; each must carry 'group', 'band', and
            'phase' keys (as produced by the group_* builders).
    """
    # Local import keeps this block self-contained; Counter replaces the
    # hand-rolled dict.get(k, 0) + 1 tallying.
    from collections import Counter

    by_group = Counter(c['group'] for c in configs)
    by_band = Counter(c['band'] for c in configs)
    by_phase = Counter(c['phase'] for c in configs)

    print(f"Total configs: {len(configs)}")
    # Headers below were pointless f-strings (no placeholders) -- plain
    # literals emit identical text.
    print("\nBy group:")
    for g, n in sorted(by_group.items()):
        print(f" {g}: {n}")
    print("\nBy band:")
    for b, n in sorted(by_band.items()):
        print(f" {b}: {n}")
    print("\nBy phase:")
    for p, n in sorted(by_phase.items()):
        print(f" Phase {p}: {n}")
if __name__ == '__main__':
    # Print a sanity-check summary of both phase matrices.
    banner = "=" * 60
    print(banner)
    print("PHASE 1 MATRIX")
    print(banner)
    summarize(get_phase1_configs())
    print()
    print(banner)
    print("PHASE 2 MATRIX")
    print(banner)
    summarize(get_phase2_configs())