File size: 4,457 Bytes
451a228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
{
  "name": "final_latest",
  "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
  "trained_tokens": 13999999995,
  "trainer_state": {
    "completed_steps": 581678,
    "train_tokens_seen": 13999999995,
    "last_loss": 0.5624260902404785
  },
  "checkpoint_args": {
    "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
    "model_backend": "custom",
    "param_dtype": "fp32",
    "weight_update_mode": "bf16_live_fp32_master",
    "batch_size": 1,
    "seq_len": 4096,
    "steps": 5,
    "warmup_steps": 2,
    "target_tokens": 13999999995,
    "max_steps": 0,
    "batch_mode": "bucketed",
    "pad_to_seq_len": false,
    "index_dir": "",
    "bucket_tokens_per_batch": 16384,
    "bucket_token_budget_spec": "512:32768,*:24576",
    "bucket_max_batch_size": 512,
    "bucket_pad_multiple": 8,
    "bucket_pad_to_upper": true,
    "bucket_replacement": false,
    "bucket_repeat_epochs": true,
    "bucket_sampling": "token_mass",
    "vocab_mode": "oeis",
    "hidden_size": 1024,
    "intermediate_size": 3072,
    "num_hidden_layers": 28,
    "num_attention_heads": 16,
    "num_key_value_heads": 8,
    "head_dim": 0,
    "compile": true,
    "compile_mode": "reduce-overhead",
    "compile_dynamic": true,
    "compile_skip_dynamic_cudagraphs": true,
    "prewarm_bucket_shapes": true,
    "prewarm_bucket_passes": 2,
    "prewarm_restore_state": true,
    "prewarm_verify_restore": true,
    "prewarm_update_optimizer": true,
    "prewarm_materialize_optimizer_state": true,
    "gradient_checkpointing": false,
    "native_gqa": true,
    "amp": true,
    "lr": 0.0003,
    "optimizer_mode": "torch_muon_hybrid",
    "adamw_lr": 0.0001,
    "muon_lr": 0.01,
    "body_lr_mult": 1.0,
    "adamw_weight_decay": 0.01,
    "muon_weight_decay": 0.0,
    "muon_momentum": 0.95,
    "muon_ns_steps": 5,
    "muon_adjust_lr_fn": "",
    "no_muon_nesterov": false,
    "lr_schedule": "warmup_cosine_cooldown",
    "lr_total_tokens": 13999999995,
    "lr_warmup_tokens": 0,
    "lr_warmup_fraction": 0.005,
    "lr_decay_end_fraction": 0.95,
    "lr_min_factor": 0.1,
    "lr_final_factor": 0.0,
    "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
    "checkpoint_every_tokens": 500000000,
    "checkpoint_every_steps": 0,
    "keep_last_checkpoints": 4,
    "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
    "save_final": true,
    "trim_final_batch": true,
    "allow_token_overshoot": false,
    "val_data": "/root/oeis_val_decontam.jsonl",
    "val_format": "auto",
    "val_index_dir": "",
    "val_every_tokens": 500000000,
    "val_every_steps": 0,
    "val_batches": 256,
    "val_batch_size": 32,
    "val_max_examples": 0,
    "val_max_context_tokens": 0,
    "oeis_eval_data": "",
    "oeis_eval_every_tokens": 0,
    "oeis_eval_every_steps": 0,
    "oeis_eval_batch_size": 64,
    "oeis_eval_max_examples": 0,
    "oeis_eval_max_new_tokens": 20,
    "oeis_eval_max_context_tokens": 0,
    "oeis_eval_collect_examples": 3,
    "oeis_eval_generation_backend": "legacy",
    "expected_loss_tokens": 0,
    "safe_preflight": true,
    "allow_synthetic_only": false,
    "allow_replacement_sampling": false,
    "preflight_only": false,
    "wandb": true,
    "wandb_project": "oeis-massive",
    "wandb_entity": "n8programs",
    "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
    "wandb_id": "oeis440m14b_20260525_025101",
    "wandb_resume": "allow",
    "wandb_mode": "online",
    "wandb_tags": "full,440m,14b,resume_skipdyn",
    "log_every_steps": 10,
    "seed": 0,
    "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
  },
  "transformers_compatible": true,
  "rope_basis": "HF/Qwen split-half RoPE basis",
  "source_rope_basis": "custom interleaved even/odd RoPE basis",
  "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
  "oeis_vocab": {
    "0-9": "digit tokens",
    "10": "negative sign",
    "11": "term separator",
    "12": "BOS",
    "13": "EOS",
    "14": "PAD",
    "15": "reserved"
  },
  "generation_defaults": {
    "max_context_tokens": 4096,
    "recommended_max_new_tokens_for_oeis_eval_neo": 192,
    "stop_token_ids": [
      11,
      13,
      14
    ]
  },
  "dtype": "float32",
  "elapsed_seconds": 7.9373568799346685
}