{ "architectures": [ "MambaInteger50M" ], "d_model": 512, "n_layer": 16, "vocab_size": 16384, "ssm_cfg": { "d_state": 64, "dt_rank": 32, "dt_min": 0.001, "dt_max": 0.1, "use_dyadic_scan": true, "scale_bits": 15, "n_heads": 16, "d_head": 32, "use_ssd": true, "chunk_size": 64 }, "training": { "seq_len": 1024, "batch_size": 64, "gradient_accumulation_steps": 2, "total_steps": 200000, "learning_rate": 6e-4, "decay_lr": 6e-4, "weight_decay": 0.05, "grad_clip": 50.0, "num_workers": 0, "log_interval": 500, "checkpoint_interval": 10000, "use_amp": false, "use_compile": false }, "rms_norm_eps": 1e-05, "use_bitshift_norm": true, "weight_quantization": "bitnet_1.58", "model_type": "mamba_integer", "model_name": "mamba_integer_50m" }