NextTerm-440M / oeis_checkpoint_meta.json
N8Programs's picture
Upload folder using huggingface_hub
fc79e8b verified
{
"name": "final_latest",
"source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
"trained_tokens": 13999999995,
"trainer_state": {
"completed_steps": 581678,
"train_tokens_seen": 13999999995,
"last_loss": 0.5624260902404785
},
"checkpoint_args": {
"data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed",
"model_backend": "custom",
"param_dtype": "fp32",
"weight_update_mode": "bf16_live_fp32_master",
"batch_size": 1,
"seq_len": 4096,
"steps": 5,
"warmup_steps": 2,
"target_tokens": 13999999995,
"max_steps": 0,
"batch_mode": "bucketed",
"pad_to_seq_len": false,
"index_dir": "",
"bucket_tokens_per_batch": 16384,
"bucket_token_budget_spec": "512:32768,*:24576",
"bucket_max_batch_size": 512,
"bucket_pad_multiple": 8,
"bucket_pad_to_upper": true,
"bucket_replacement": false,
"bucket_repeat_epochs": true,
"bucket_sampling": "token_mass",
"vocab_mode": "oeis",
"hidden_size": 1024,
"intermediate_size": 3072,
"num_hidden_layers": 28,
"num_attention_heads": 16,
"num_key_value_heads": 8,
"head_dim": 0,
"compile": true,
"compile_mode": "reduce-overhead",
"compile_dynamic": true,
"compile_skip_dynamic_cudagraphs": true,
"prewarm_bucket_shapes": true,
"prewarm_bucket_passes": 2,
"prewarm_restore_state": true,
"prewarm_verify_restore": true,
"prewarm_update_optimizer": true,
"prewarm_materialize_optimizer_state": true,
"gradient_checkpointing": false,
"native_gqa": true,
"amp": true,
"lr": 0.0003,
"optimizer_mode": "torch_muon_hybrid",
"adamw_lr": 0.0001,
"muon_lr": 0.01,
"body_lr_mult": 1.0,
"adamw_weight_decay": 0.01,
"muon_weight_decay": 0.0,
"muon_momentum": 0.95,
"muon_ns_steps": 5,
"muon_adjust_lr_fn": "",
"no_muon_nesterov": false,
"lr_schedule": "warmup_cosine_cooldown",
"lr_total_tokens": 13999999995,
"lr_warmup_tokens": 0,
"lr_warmup_fraction": 0.005,
"lr_decay_end_fraction": 0.95,
"lr_min_factor": 0.1,
"lr_final_factor": 0.0,
"checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101",
"checkpoint_every_tokens": 500000000,
"checkpoint_every_steps": 0,
"keep_last_checkpoints": 4,
"resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt",
"save_final": true,
"trim_final_batch": true,
"allow_token_overshoot": false,
"val_data": "/root/oeis_val_decontam.jsonl",
"val_format": "auto",
"val_index_dir": "",
"val_every_tokens": 500000000,
"val_every_steps": 0,
"val_batches": 256,
"val_batch_size": 32,
"val_max_examples": 0,
"val_max_context_tokens": 0,
"oeis_eval_data": "",
"oeis_eval_every_tokens": 0,
"oeis_eval_every_steps": 0,
"oeis_eval_batch_size": 64,
"oeis_eval_max_examples": 0,
"oeis_eval_max_new_tokens": 20,
"oeis_eval_max_context_tokens": 0,
"oeis_eval_collect_examples": 3,
"oeis_eval_generation_backend": "legacy",
"expected_loss_tokens": 0,
"safe_preflight": true,
"allow_synthetic_only": false,
"allow_replacement_sampling": false,
"preflight_only": false,
"wandb": true,
"wandb_project": "oeis-massive",
"wandb_entity": "n8programs",
"wandb_run_name": "oeis-440m-14b-full-20260525_025101",
"wandb_id": "oeis440m14b_20260525_025101",
"wandb_resume": "allow",
"wandb_mode": "online",
"wandb_tags": "full,440m,14b,resume_skipdyn",
"log_every_steps": 10,
"seed": 0,
"report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json"
},
"transformers_compatible": true,
"rope_basis": "HF/Qwen split-half RoPE basis",
"source_rope_basis": "custom interleaved even/odd RoPE basis",
"conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers",
"oeis_vocab": {
"0-9": "digit tokens",
"10": "negative sign",
"11": "term separator",
"12": "BOS",
"13": "EOS",
"14": "PAD",
"15": "reserved"
},
"generation_defaults": {
"max_context_tokens": 4096,
"recommended_max_new_tokens_for_oeis_eval_neo": 192,
"stop_token_ids": [
11,
13,
14
]
},
"dtype": "bfloat16",
"elapsed_seconds": 7.9373568799346685,
"local_conversion": {
"source": "/Users/natebreslow/Documents/khashabiLab/bigOEIS/hf_downloads/NextTerm-440M-Checkpoints/checkpoints/final_latest",
"conversion": "float tensors downcast from fp32 safetensors to bf16 safetensors for local MLX inference throughput"
}
}