{ "name": "final_latest", "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt", "trained_tokens": 13999999995, "trainer_state": { "completed_steps": 581678, "train_tokens_seen": 13999999995, "last_loss": 0.5624260902404785 }, "checkpoint_args": { "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed", "model_backend": "custom", "param_dtype": "fp32", "weight_update_mode": "bf16_live_fp32_master", "batch_size": 1, "seq_len": 4096, "steps": 5, "warmup_steps": 2, "target_tokens": 13999999995, "max_steps": 0, "batch_mode": "bucketed", "pad_to_seq_len": false, "index_dir": "", "bucket_tokens_per_batch": 16384, "bucket_token_budget_spec": "512:32768,*:24576", "bucket_max_batch_size": 512, "bucket_pad_multiple": 8, "bucket_pad_to_upper": true, "bucket_replacement": false, "bucket_repeat_epochs": true, "bucket_sampling": "token_mass", "vocab_mode": "oeis", "hidden_size": 1024, "intermediate_size": 3072, "num_hidden_layers": 28, "num_attention_heads": 16, "num_key_value_heads": 8, "head_dim": 0, "compile": true, "compile_mode": "reduce-overhead", "compile_dynamic": true, "compile_skip_dynamic_cudagraphs": true, "prewarm_bucket_shapes": true, "prewarm_bucket_passes": 2, "prewarm_restore_state": true, "prewarm_verify_restore": true, "prewarm_update_optimizer": true, "prewarm_materialize_optimizer_state": true, "gradient_checkpointing": false, "native_gqa": true, "amp": true, "lr": 0.0003, "optimizer_mode": "torch_muon_hybrid", "adamw_lr": 0.0001, "muon_lr": 0.01, "body_lr_mult": 1.0, "adamw_weight_decay": 0.01, "muon_weight_decay": 0.0, "muon_momentum": 0.95, "muon_ns_steps": 5, "muon_adjust_lr_fn": "", "no_muon_nesterov": false, "lr_schedule": "warmup_cosine_cooldown", "lr_total_tokens": 13999999995, "lr_warmup_tokens": 0, "lr_warmup_fraction": 0.005, "lr_decay_end_fraction": 0.95, "lr_min_factor": 0.1, "lr_final_factor": 0.0, "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101", "checkpoint_every_tokens": 500000000, "checkpoint_every_steps": 0, "keep_last_checkpoints": 4, "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt", "save_final": true, "trim_final_batch": true, "allow_token_overshoot": false, "val_data": "/root/oeis_val_decontam.jsonl", "val_format": "auto", "val_index_dir": "", "val_every_tokens": 500000000, "val_every_steps": 0, "val_batches": 256, "val_batch_size": 32, "val_max_examples": 0, "val_max_context_tokens": 0, "oeis_eval_data": "", "oeis_eval_every_tokens": 0, "oeis_eval_every_steps": 0, "oeis_eval_batch_size": 64, "oeis_eval_max_examples": 0, "oeis_eval_max_new_tokens": 20, "oeis_eval_max_context_tokens": 0, "oeis_eval_collect_examples": 3, "oeis_eval_generation_backend": "legacy", "expected_loss_tokens": 0, "safe_preflight": true, "allow_synthetic_only": false, "allow_replacement_sampling": false, "preflight_only": false, "wandb": true, "wandb_project": "oeis-massive", "wandb_entity": "n8programs", "wandb_run_name": "oeis-440m-14b-full-20260525_025101", "wandb_id": "oeis440m14b_20260525_025101", "wandb_resume": "allow", "wandb_mode": "online", "wandb_tags": "full,440m,14b,resume_skipdyn", "log_every_steps": 10, "seed": 0, "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json" }, "transformers_compatible": true, "rope_basis": "HF/Qwen split-half RoPE basis", "source_rope_basis": "custom interleaved even/odd RoPE basis", "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers", "oeis_vocab": { "0-9": "digit tokens", "10": "negative sign", "11": "term separator", "12": "BOS", "13": "EOS", "14": "PAD", "15": "reserved" }, "generation_defaults": { "max_context_tokens": 4096, "recommended_max_new_tokens_for_oeis_eval_neo": 192, "stop_token_ids": [ 11, 13, 14 ] }, "dtype": "bfloat16", "elapsed_seconds": 7.9373568799346685, "local_conversion": { "source": "/Users/natebreslow/Documents/khashabiLab/bigOEIS/hf_downloads/NextTerm-440M-Checkpoints/checkpoints/final_latest", "conversion": "float tensors downcast from fp32 safetensors to bf16 safetensors for local MLX inference throughput" } }