N8Programs committed on
Commit
fc79e8b
·
verified ·
1 Parent(s): ef65102

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 12,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 13,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 14,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000.0,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": false,
59
+ "transformers_version": "5.9.0",
60
+ "use_cache": true,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 16,
63
+ "rope_theta": 1000000.0,
64
+ "torch_dtype": "bfloat16"
65
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 12,
3
+ "do_sample": false,
4
+ "eos_token_id": 13,
5
+ "pad_token_id": 14,
6
+ "transformers_version": "5.9.0",
7
+ "use_cache": true
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84fe6511b007c9c85706f563f547525c7cd57571227940d91b1c514041316128
3
+ size 881035576
oeis_checkpoint_meta.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "final_latest",
3
+ "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
4
+ "trained_tokens": 13999999995,
5
+ "trainer_state": {
6
+ "completed_steps": 581678,
7
+ "train_tokens_seen": 13999999995,
8
+ "last_loss": 0.5624260902404785
9
+ },
10
+ "checkpoint_args": {
11
+ "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
12
+ "model_backend": "custom",
13
+ "param_dtype": "fp32",
14
+ "weight_update_mode": "bf16_live_fp32_master",
15
+ "batch_size": 1,
16
+ "seq_len": 4096,
17
+ "steps": 5,
18
+ "warmup_steps": 2,
19
+ "target_tokens": 13999999995,
20
+ "max_steps": 0,
21
+ "batch_mode": "bucketed",
22
+ "pad_to_seq_len": false,
23
+ "index_dir": "",
24
+ "bucket_tokens_per_batch": 16384,
25
+ "bucket_token_budget_spec": "512:32768,*:24576",
26
+ "bucket_max_batch_size": 512,
27
+ "bucket_pad_multiple": 8,
28
+ "bucket_pad_to_upper": true,
29
+ "bucket_replacement": false,
30
+ "bucket_repeat_epochs": true,
31
+ "bucket_sampling": "token_mass",
32
+ "vocab_mode": "oeis",
33
+ "hidden_size": 1024,
34
+ "intermediate_size": 3072,
35
+ "num_hidden_layers": 28,
36
+ "num_attention_heads": 16,
37
+ "num_key_value_heads": 8,
38
+ "head_dim": 0,
39
+ "compile": true,
40
+ "compile_mode": "reduce-overhead",
41
+ "compile_dynamic": true,
42
+ "compile_skip_dynamic_cudagraphs": true,
43
+ "prewarm_bucket_shapes": true,
44
+ "prewarm_bucket_passes": 2,
45
+ "prewarm_restore_state": true,
46
+ "prewarm_verify_restore": true,
47
+ "prewarm_update_optimizer": true,
48
+ "prewarm_materialize_optimizer_state": true,
49
+ "gradient_checkpointing": false,
50
+ "native_gqa": true,
51
+ "amp": true,
52
+ "lr": 0.0003,
53
+ "optimizer_mode": "torch_muon_hybrid",
54
+ "adamw_lr": 0.0001,
55
+ "muon_lr": 0.01,
56
+ "body_lr_mult": 1.0,
57
+ "adamw_weight_decay": 0.01,
58
+ "muon_weight_decay": 0.0,
59
+ "muon_momentum": 0.95,
60
+ "muon_ns_steps": 5,
61
+ "muon_adjust_lr_fn": "",
62
+ "no_muon_nesterov": false,
63
+ "lr_schedule": "warmup_cosine_cooldown",
64
+ "lr_total_tokens": 13999999995,
65
+ "lr_warmup_tokens": 0,
66
+ "lr_warmup_fraction": 0.005,
67
+ "lr_decay_end_fraction": 0.95,
68
+ "lr_min_factor": 0.1,
69
+ "lr_final_factor": 0.0,
70
+ "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
71
+ "checkpoint_every_tokens": 500000000,
72
+ "checkpoint_every_steps": 0,
73
+ "keep_last_checkpoints": 4,
74
+ "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
75
+ "save_final": true,
76
+ "trim_final_batch": true,
77
+ "allow_token_overshoot": false,
78
+ "val_data": "/root/oeis_val_decontam.jsonl",
79
+ "val_format": "auto",
80
+ "val_index_dir": "",
81
+ "val_every_tokens": 500000000,
82
+ "val_every_steps": 0,
83
+ "val_batches": 256,
84
+ "val_batch_size": 32,
85
+ "val_max_examples": 0,
86
+ "val_max_context_tokens": 0,
87
+ "oeis_eval_data": "",
88
+ "oeis_eval_every_tokens": 0,
89
+ "oeis_eval_every_steps": 0,
90
+ "oeis_eval_batch_size": 64,
91
+ "oeis_eval_max_examples": 0,
92
+ "oeis_eval_max_new_tokens": 20,
93
+ "oeis_eval_max_context_tokens": 0,
94
+ "oeis_eval_collect_examples": 3,
95
+ "oeis_eval_generation_backend": "legacy",
96
+ "expected_loss_tokens": 0,
97
+ "safe_preflight": true,
98
+ "allow_synthetic_only": false,
99
+ "allow_replacement_sampling": false,
100
+ "preflight_only": false,
101
+ "wandb": true,
102
+ "wandb_project": "oeis-massive",
103
+ "wandb_entity": "n8programs",
104
+ "wandb_run_name": "oeis-440m-14b-full-20260525_025101",
105
+ "wandb_id": "oeis440m14b_20260525_025101",
106
+ "wandb_resume": "allow",
107
+ "wandb_mode": "online",
108
+ "wandb_tags": "full,440m,14b,resume_skipdyn",
109
+ "log_every_steps": 10,
110
+ "seed": 0,
111
+ "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
112
+ },
113
+ "transformers_compatible": true,
114
+ "rope_basis": "HF/Qwen split-half RoPE basis",
115
+ "source_rope_basis": "custom interleaved even/odd RoPE basis",
116
+ "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
117
+ "oeis_vocab": {
118
+ "0-9": "digit tokens",
119
+ "10": "negative sign",
120
+ "11": "term separator",
121
+ "12": "BOS",
122
+ "13": "EOS",
123
+ "14": "PAD",
124
+ "15": "reserved"
125
+ },
126
+ "generation_defaults": {
127
+ "max_context_tokens": 4096,
128
+ "recommended_max_new_tokens_for_oeis_eval_neo": 192,
129
+ "stop_token_ids": [
130
+ 11,
131
+ 13,
132
+ 14
133
+ ]
134
+ },
135
+ "dtype": "bfloat16",
136
+ "elapsed_seconds": 7.9373568799346685,
137
+ "local_conversion": {
138
+ "source": "/Users/natebreslow/Documents/khashabiLab/bigOEIS/hf_downloads/NextTerm-440M-Checkpoints/checkpoints/final_latest",
139
+ "conversion": "float tensors downcast from fp32 safetensors to bf16 safetensors for local MLX inference throughput"
140
+ }
141
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<unused>"
4
+ ],
5
+ "bos_token": "<bos>",
6
+ "eos_token": "<eos>",
7
+ "pad_token": "<pad>",
8
+ "unk_token": "<pad>"
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 12,
8
+ "content": "<bos>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 13,
17
+ "content": "<eos>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 14,
26
+ "content": "<pad>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 15,
35
+ "content": "<unused>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "Sequence",
45
+ "normalizers": [
46
+ {
47
+ "type": "Replace",
48
+ "pattern": {
49
+ "Regex": "[^0-9,-]+"
50
+ },
51
+ "content": ""
52
+ },
53
+ {
54
+ "type": "Replace",
55
+ "pattern": {
56
+ "Regex": ",+"
57
+ },
58
+ "content": ","
59
+ },
60
+ {
61
+ "type": "Strip",
62
+ "strip_left": true,
63
+ "strip_right": true
64
+ },
65
+ {
66
+ "type": "Replace",
67
+ "pattern": {
68
+ "Regex": "^,+"
69
+ },
70
+ "content": ""
71
+ },
72
+ { "type": "Replace", "pattern": { "Regex": ",{2,}$" }, "content": "," }
73
+ ]
74
+ },
75
+ "pre_tokenizer": {
76
+ "type": "Split",
77
+ "pattern": {
78
+ "Regex": ""
79
+ },
80
+ "behavior": "Isolated",
81
+ "invert": false
82
+ },
83
+ "post_processor": {
84
+ "type": "TemplateProcessing",
85
+ "single": [
86
+ {
87
+ "SpecialToken": {
88
+ "id": "<bos>",
89
+ "type_id": 0
90
+ }
91
+ },
92
+ {
93
+ "Sequence": {
94
+ "id": "A",
95
+ "type_id": 0
96
+ }
97
+ }
98
+ ],
99
+ "pair": [
100
+ {
101
+ "Sequence": {
102
+ "id": "A",
103
+ "type_id": 0
104
+ }
105
+ },
106
+ {
107
+ "Sequence": {
108
+ "id": "B",
109
+ "type_id": 1
110
+ }
111
+ }
112
+ ],
113
+ "special_tokens": {
114
+ "<bos>": {
115
+ "id": "<bos>",
116
+ "ids": [
117
+ 12
118
+ ],
119
+ "tokens": [
120
+ "<bos>"
121
+ ]
122
+ }
123
+ }
124
+ },
125
+ "decoder": {
126
+ "type": "Sequence",
127
+ "decoders": [
128
+ {
129
+ "type": "Replace",
130
+ "pattern": {
131
+ "String": " "
132
+ },
133
+ "content": ""
134
+ }
135
+ ]
136
+ },
137
+ "model": {
138
+ "type": "WordLevel",
139
+ "vocab": {
140
+ "0": 0,
141
+ "1": 1,
142
+ "2": 2,
143
+ "3": 3,
144
+ "4": 4,
145
+ "5": 5,
146
+ "6": 6,
147
+ "7": 7,
148
+ "8": 8,
149
+ "9": 9,
150
+ "-": 10,
151
+ ",": 11,
152
+ "<bos>": 12,
153
+ "<eos>": 13,
154
+ "<pad>": 14,
155
+ "<unused>": 15
156
+ },
157
+ "unk_token": "<pad>"
158
+ }
159
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "12": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "13": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "14": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "15": {
28
+ "content": "<unused>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "additional_special_tokens": [
37
+ "<unused>"
38
+ ],
39
+ "bos_token": "<bos>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "<eos>",
42
+ "extra_special_tokens": {},
43
+ "model_max_length": 40960,
44
+ "pad_token": "<pad>",
45
+ "tokenizer_class": "PreTrainedTokenizerFast",
46
+ "unk_token": "<pad>"
47
+ }