Instructions to use N8Programs/NextTerm-440M-Checkpoints with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use N8Programs/NextTerm-440M-Checkpoints with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("N8Programs/NextTerm-440M-Checkpoints", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "name": "final_latest", | |
| "source_checkpoint": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt", | |
| "trained_tokens": 13999999995, | |
| "trainer_state": { | |
| "completed_steps": 581678, | |
| "train_tokens_seen": 13999999995, | |
| "last_loss": 0.5624260902404785 | |
| }, | |
| "checkpoint_args": { | |
| "data": "/root/oeis-massive/packed_data/oeis_train_full_synth_plus_organic_13999999995.packed", | |
| "model_backend": "custom", | |
| "param_dtype": "fp32", | |
| "weight_update_mode": "bf16_live_fp32_master", | |
| "batch_size": 1, | |
| "seq_len": 4096, | |
| "steps": 5, | |
| "warmup_steps": 2, | |
| "target_tokens": 13999999995, | |
| "max_steps": 0, | |
| "batch_mode": "bucketed", | |
| "pad_to_seq_len": false, | |
| "index_dir": "", | |
| "bucket_tokens_per_batch": 16384, | |
| "bucket_token_budget_spec": "512:32768,*:24576", | |
| "bucket_max_batch_size": 512, | |
| "bucket_pad_multiple": 8, | |
| "bucket_pad_to_upper": true, | |
| "bucket_replacement": false, | |
| "bucket_repeat_epochs": true, | |
| "bucket_sampling": "token_mass", | |
| "vocab_mode": "oeis", | |
| "hidden_size": 1024, | |
| "intermediate_size": 3072, | |
| "num_hidden_layers": 28, | |
| "num_attention_heads": 16, | |
| "num_key_value_heads": 8, | |
| "head_dim": 0, | |
| "compile": true, | |
| "compile_mode": "reduce-overhead", | |
| "compile_dynamic": true, | |
| "compile_skip_dynamic_cudagraphs": true, | |
| "prewarm_bucket_shapes": true, | |
| "prewarm_bucket_passes": 2, | |
| "prewarm_restore_state": true, | |
| "prewarm_verify_restore": true, | |
| "prewarm_update_optimizer": true, | |
| "prewarm_materialize_optimizer_state": true, | |
| "gradient_checkpointing": false, | |
| "native_gqa": true, | |
| "amp": true, | |
| "lr": 0.0003, | |
| "optimizer_mode": "torch_muon_hybrid", | |
| "adamw_lr": 0.0001, | |
| "muon_lr": 0.01, | |
| "body_lr_mult": 1.0, | |
| "adamw_weight_decay": 0.01, | |
| "muon_weight_decay": 0.0, | |
| "muon_momentum": 0.95, | |
| "muon_ns_steps": 5, | |
| "muon_adjust_lr_fn": "", | |
| "no_muon_nesterov": false, | |
| "lr_schedule": "warmup_cosine_cooldown", | |
| "lr_total_tokens": 13999999995, | |
| "lr_warmup_tokens": 0, | |
| "lr_warmup_fraction": 0.005, | |
| "lr_decay_end_fraction": 0.95, | |
| "lr_min_factor": 0.1, | |
| "lr_final_factor": 0.0, | |
| "checkpoint_dir": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101", | |
| "checkpoint_every_tokens": 500000000, | |
| "checkpoint_every_steps": 0, | |
| "keep_last_checkpoints": 4, | |
| "resume": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/latest.pt", | |
| "save_final": true, | |
| "trim_final_batch": true, | |
| "allow_token_overshoot": false, | |
| "val_data": "/root/oeis_val_decontam.jsonl", | |
| "val_format": "auto", | |
| "val_index_dir": "", | |
| "val_every_tokens": 500000000, | |
| "val_every_steps": 0, | |
| "val_batches": 256, | |
| "val_batch_size": 32, | |
| "val_max_examples": 0, | |
| "val_max_context_tokens": 0, | |
| "oeis_eval_data": "", | |
| "oeis_eval_every_tokens": 0, | |
| "oeis_eval_every_steps": 0, | |
| "oeis_eval_batch_size": 64, | |
| "oeis_eval_max_examples": 0, | |
| "oeis_eval_max_new_tokens": 20, | |
| "oeis_eval_max_context_tokens": 0, | |
| "oeis_eval_collect_examples": 3, | |
| "oeis_eval_generation_backend": "legacy", | |
| "expected_loss_tokens": 0, | |
| "safe_preflight": true, | |
| "allow_synthetic_only": false, | |
| "allow_replacement_sampling": false, | |
| "preflight_only": false, | |
| "wandb": true, | |
| "wandb_project": "oeis-massive", | |
| "wandb_entity": "n8programs", | |
| "wandb_run_name": "oeis-440m-14b-full-20260525_025101", | |
| "wandb_id": "oeis440m14b_20260525_025101", | |
| "wandb_resume": "allow", | |
| "wandb_mode": "online", | |
| "wandb_tags": "full,440m,14b,resume_skipdyn", | |
| "log_every_steps": 10, | |
| "seed": 0, | |
| "report_json": "/root/oeis_runs/oeis-440m-14b-full-20260525_025101/report_resume_skipdyn_20260526_114400.json" | |
| }, | |
| "transformers_compatible": true, | |
| "rope_basis": "HF/Qwen split-half RoPE basis", | |
| "source_rope_basis": "custom interleaved even/odd RoPE basis", | |
| "conversion": "q_proj/k_proj rows and q_norm/k_norm weights permuted with run_oeis_nextterm_eval_torch._map_custom_state_to_transformers", | |
| "oeis_vocab": { | |
| "0-9": "digit tokens", | |
| "10": "negative sign", | |
| "11": "term separator", | |
| "12": "BOS", | |
| "13": "EOS", | |
| "14": "PAD", | |
| "15": "reserved" | |
| }, | |
| "generation_defaults": { | |
| "max_context_tokens": 4096, | |
| "recommended_max_new_tokens_for_oeis_eval_neo": 192, | |
| "stop_token_ids": [ | |
| 11, | |
| 13, | |
| 14 | |
| ] | |
| }, | |
| "dtype": "float32", | |
| "elapsed_seconds": 7.9373568799346685 | |
| } | |