N8Programs committed on
Commit
451a228
·
verified ·
1 Parent(s): d47573d

Add files using upload-large-folder tool

Browse files
Files changed (26) hide show
  1. README.md +79 -0
  2. checkpoint_manifest.json +67 -0
  3. checkpoints/best_val/config.json +63 -0
  4. checkpoints/best_val/generation_config.json +12 -0
  5. checkpoints/best_val/model.safetensors +3 -0
  6. checkpoints/best_val/oeis_checkpoint_meta.json +137 -0
  7. checkpoints/checkpoint_tokens_012000258345/config.json +63 -0
  8. checkpoints/checkpoint_tokens_012000258345/generation_config.json +12 -0
  9. checkpoints/checkpoint_tokens_012000258345/model.safetensors +3 -0
  10. checkpoints/checkpoint_tokens_012000258345/oeis_checkpoint_meta.json +137 -0
  11. checkpoints/checkpoint_tokens_012500265837/config.json +63 -0
  12. checkpoints/checkpoint_tokens_012500265837/generation_config.json +12 -0
  13. checkpoints/checkpoint_tokens_012500265837/model.safetensors +3 -0
  14. checkpoints/checkpoint_tokens_012500265837/oeis_checkpoint_meta.json +137 -0
  15. checkpoints/checkpoint_tokens_013000266889/config.json +63 -0
  16. checkpoints/checkpoint_tokens_013000266889/generation_config.json +12 -0
  17. checkpoints/checkpoint_tokens_013000266889/model.safetensors +3 -0
  18. checkpoints/checkpoint_tokens_013000266889/oeis_checkpoint_meta.json +137 -0
  19. checkpoints/checkpoint_tokens_013500289737/config.json +63 -0
  20. checkpoints/checkpoint_tokens_013500289737/generation_config.json +12 -0
  21. checkpoints/checkpoint_tokens_013500289737/model.safetensors +3 -0
  22. checkpoints/checkpoint_tokens_013500289737/oeis_checkpoint_meta.json +137 -0
  23. checkpoints/final_latest/config.json +63 -0
  24. checkpoints/final_latest/generation_config.json +12 -0
  25. checkpoints/final_latest/model.safetensors +3 -0
  26. checkpoints/final_latest/oeis_checkpoint_meta.json +137 -0
README.md CHANGED
@@ -1,3 +1,82 @@
1
  ---
2
  license: cc-by-sa-4.0
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-sa-4.0
3
+ library_name: transformers
4
+ tags:
5
+ - oeis
6
+ - qwen3
7
+ - causal-lm
8
+ - checkpoint
9
  ---
10
+
11
+ # NextTerm-440M Checkpoints
12
+
13
+ Transformers-compatible checkpoints from the OEIS NextTerm-440M run.
14
+
15
+ These checkpoints use a Qwen3-style causal LM architecture with a 16-token OEIS digit vocabulary. They were converted from the training checkpoints by remapping the custom interleaved RoPE basis into the Hugging Face / Qwen split-half RoPE basis, so they can be loaded directly with `AutoModelForCausalLM`.
16
+
17
+ ## Checkpoints
18
+
19
+ | Folder | Tokens trained | Notes |
20
+ | --- | ---: | --- |
21
+ | `checkpoints/final_latest` | 13,999,999,995 | Final checkpoint; recommended default |
22
+ | `checkpoints/best_val` | 9,500,200,875 | Best validation-loss checkpoint |
23
+ | `checkpoints/checkpoint_tokens_012000258345` | 12,000,258,345 | Historical checkpoint |
24
+ | `checkpoints/checkpoint_tokens_012500265837` | 12,500,265,837 | Historical checkpoint |
25
+ | `checkpoints/checkpoint_tokens_013000266889` | 13,000,266,889 | Historical checkpoint |
26
+ | `checkpoints/checkpoint_tokens_013500289737` | 13,500,289,737 | Historical checkpoint |
27
+
28
+ ## OEIS Vocab
29
+
30
+ The model operates directly on token IDs; no text tokenizer is included, so inputs are built from the IDs in the table below.
31
+
32
+ | Token ID | Meaning |
33
+ | ---: | --- |
34
+ | `0`-`9` | decimal digits |
35
+ | `10` | negative sign |
36
+ | `11` | term separator |
37
+ | `12` | BOS |
38
+ | `13` | EOS |
39
+ | `14` | PAD |
40
+ | `15` | reserved |
41
+
42
+ For next-term generation, stop as soon as the model emits any of the token IDs `[11, 13, 14]` (term separator, EOS, or PAD).
43
+
44
+ ## Loading
45
+
46
+ ```python
47
+ import torch
48
+ from transformers import AutoModelForCausalLM
49
+
50
+ model = AutoModelForCausalLM.from_pretrained(
51
+ "N8Programs/NextTerm-440M-Checkpoints",
52
+ subfolder="checkpoints/final_latest",
53
+ dtype=torch.bfloat16,
54
+ device_map="auto",
55
+ )
56
+ ```
57
+
58
+ Example input IDs for the prefix `1, 2, 3, ...` (BOS token `12`, then each term followed by the term separator `11`):
59
+
60
+ ```python
61
+ input_ids = torch.tensor([[12, 1, 11, 2, 11, 3, 11]], device=model.device)
62
+ out = model.generate(
63
+ input_ids,
64
+ max_new_tokens=192,
65
+ do_sample=False,
66
+ eos_token_id=[11, 13, 14],
67
+ pad_token_id=14,
68
+ )
69
+ ```
70
+
71
+ ## Evaluation Notes
72
+
73
+ OEIS Eval Neo excludes exact packed-sequence overlaps with the training data and uses `max_new_tokens=192`, which is sufficient for every answer in that eval set.
74
+
75
+ Known OEIS Eval Neo results:
76
+
77
+ | Checkpoint | Accuracy |
78
+ | --- | ---: |
79
+ | `final_latest` | 6545 / 19034 = 34.39% |
80
+ | `best_val` | 6477 / 19034 = 34.03% |
81
+
82
+ Each checkpoint folder includes an `oeis_checkpoint_meta.json` file with training tokens, source checkpoint path, and conversion details.
checkpoint_manifest.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "repo": "N8Programs/NextTerm-440M-Checkpoints",
3
+ "format": "transformers save_pretrained subfolders",
4
+ "recommended_checkpoint": "checkpoints/final_latest",
5
+ "oeis_vocab_size": 16,
6
+ "stop_token_ids": [
7
+ 11,
8
+ 13,
9
+ 14
10
+ ],
11
+ "checkpoints": [
12
+ {
13
+ "name": "best_val",
14
+ "subfolder": "checkpoints/best_val",
15
+ "trained_tokens": 9500200875,
16
+ "dtype": "float32",
17
+ "rope_basis": "HF/Qwen split-half RoPE basis",
18
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
19
+ "transformers_compatible": true
20
+ },
21
+ {
22
+ "name": "checkpoint_tokens_012000258345",
23
+ "subfolder": "checkpoints/checkpoint_tokens_012000258345",
24
+ "trained_tokens": 12000258345,
25
+ "dtype": "float32",
26
+ "rope_basis": "HF/Qwen split-half RoPE basis",
27
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
28
+ "transformers_compatible": true
29
+ },
30
+ {
31
+ "name": "checkpoint_tokens_012500265837",
32
+ "subfolder": "checkpoints/checkpoint_tokens_012500265837",
33
+ "trained_tokens": 12500265837,
34
+ "dtype": "float32",
35
+ "rope_basis": "HF/Qwen split-half RoPE basis",
36
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
37
+ "transformers_compatible": true
38
+ },
39
+ {
40
+ "name": "checkpoint_tokens_013000266889",
41
+ "subfolder": "checkpoints/checkpoint_tokens_013000266889",
42
+ "trained_tokens": 13000266889,
43
+ "dtype": "float32",
44
+ "rope_basis": "HF/Qwen split-half RoPE basis",
45
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
46
+ "transformers_compatible": true
47
+ },
48
+ {
49
+ "name": "checkpoint_tokens_013500289737",
50
+ "subfolder": "checkpoints/checkpoint_tokens_013500289737",
51
+ "trained_tokens": 13500289737,
52
+ "dtype": "float32",
53
+ "rope_basis": "HF/Qwen split-half RoPE basis",
54
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
55
+ "transformers_compatible": true
56
+ },
57
+ {
58
+ "name": "final_latest",
59
+ "subfolder": "checkpoints/final_latest",
60
+ "trained_tokens": 13999999995,
61
+ "dtype": "float32",
62
+ "rope_basis": "HF/Qwen split-half RoPE basis",
63
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
64
+ "transformers_compatible": true
65
+ }
66
+ ]
67
+ }
checkpoints/best_val/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "float32",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16
63
+ }
checkpoints/best_val/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": [
5
+ 11,
6
+ 13,
7
+ 14
8
+ ],
9
+ "pad_token_id": 14,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
checkpoints/best_val/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09dd9ae312dbe26bf087da16a95d85a95c94171ace11973dea01fa36e39a7de1
3
+ size 1762036008
checkpoints/best_val/oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "best_val",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/best_val.pt",
4
+ "trained_tokens": 9500200875,
5
+ "trainer_state": {
6
+ "completed_steps": 394768,
7
+ "train_tokens_seen": 9500200875,
8
+ "last_loss": 0.5931950807571411
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "float32",
136
+ "elapsed_seconds": 8.110643866471946
137
+ }
checkpoints/checkpoint_tokens_012000258345/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "float32",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16
63
+ }
checkpoints/checkpoint_tokens_012000258345/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": [
5
+ 11,
6
+ 13,
7
+ 14
8
+ ],
9
+ "pad_token_id": 14,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
checkpoints/checkpoint_tokens_012000258345/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d89464d801e77673b8765dce243f74145aef74d378d4c54a448f5c20b34f3316
3
+ size 1762036008
checkpoints/checkpoint_tokens_012000258345/oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "checkpoint_tokens_012000258345",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/checkpoint_tokens_012000258345.pt",
4
+ "trained_tokens": 12000258345,
5
+ "trainer_state": {
6
+ "completed_steps": 498580,
7
+ "train_tokens_seen": 12000258345,
8
+ "last_loss": 0.5622460842132568
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "float32",
136
+ "elapsed_seconds": 7.989120943471789
137
+ }
checkpoints/checkpoint_tokens_012500265837/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "float32",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16
63
+ }
checkpoints/checkpoint_tokens_012500265837/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": [
5
+ 11,
6
+ 13,
7
+ 14
8
+ ],
9
+ "pad_token_id": 14,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
checkpoints/checkpoint_tokens_012500265837/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e827f9f0a3a61a8f7346b4ba0e9f51eeb7a82eeee286d625c1200c4a4b5429cf
3
+ size 1762036008
checkpoints/checkpoint_tokens_012500265837/oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "checkpoint_tokens_012500265837",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/checkpoint_tokens_012500265837.pt",
4
+ "trained_tokens": 12500265837,
5
+ "trainer_state": {
6
+ "completed_steps": 519332,
7
+ "train_tokens_seen": 12500265837,
8
+ "last_loss": 0.40147635340690613
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "float32",
136
+ "elapsed_seconds": 8.423396774567664
137
+ }
checkpoints/checkpoint_tokens_013000266889/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "float32",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16
63
+ }
checkpoints/checkpoint_tokens_013000266889/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": [
5
+ 11,
6
+ 13,
7
+ 14
8
+ ],
9
+ "pad_token_id": 14,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
checkpoints/checkpoint_tokens_013000266889/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9f1a19595150478a37e117e0c731b4c37113339cabbac2a62044bedb8e12e4f
3
+ size 1762036008
checkpoints/checkpoint_tokens_013000266889/oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "checkpoint_tokens_013000266889",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/checkpoint_tokens_013000266889.pt",
4
+ "trained_tokens": 13000266889,
5
+ "trainer_state": {
6
+ "completed_steps": 540129,
7
+ "train_tokens_seen": 13000266889,
8
+ "last_loss": 0.99857097864151
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "float32",
136
+ "elapsed_seconds": 7.916065665893257
137
+ }
checkpoints/checkpoint_tokens_013500289737/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "float32",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16
63
+ }
checkpoints/checkpoint_tokens_013500289737/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": [
5
+ 11,
6
+ 13,
7
+ 14
8
+ ],
9
+ "pad_token_id": 14,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
checkpoints/checkpoint_tokens_013500289737/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d80e416c674dff4cf3137f306960a2349dc11181f0e0580589b0b5a58f386923
3
+ size 1762036008
checkpoints/checkpoint_tokens_013500289737/oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "checkpoint_tokens_013500289737",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/checkpoint_tokens_013500289737.pt",
4
+ "trained_tokens": 13500289737,
5
+ "trainer_state": {
6
+ "completed_steps": 560923,
7
+ "train_tokens_seen": 13500289737,
8
+ "last_loss": 0.43072962760925293
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "float32",
136
+ "elapsed_seconds": 7.8347954377532005
137
+ }
checkpoints/final_latest/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "float32",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16
63
+ }
checkpoints/final_latest/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": [
5
+ 11,
6
+ 13,
7
+ 14
8
+ ],
9
+ "pad_token_id": 14,
10
+ "transformers_version": "5.9.0",
11
+ "use_cache": true
12
+ }
checkpoints/final_latest/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1364c6b994ea498c66d39ffcb1387b6246587c336783735985cdb5af25f7573d
3
+ size 1762036008
checkpoints/final_latest/oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "final_latest",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
4
+ "trained_tokens": 13999999995,
5
+ "trainer_state": {
6
+ "completed_steps": 581678,
7
+ "train_tokens_seen": 13999999995,
8
+ "last_loss": 0.5624260902404785
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "float32",
136
+ "elapsed_seconds": 7.9373568799346685
137
+ }