config.json — uploaded by reaperdoesntknow (commit message: "Upload MoAMetricLM"; revision efb471d, verified)
{
"alpha_init": 1.0,
"architectures": [
"MoAMetricLM"
],
"attn_drop": 0.0,
"attn_heads": 16,
"bos_token_id": 0,
"chunk_size": 256,
"conv_kernel": 5,
"conv_mult": 2,
"dim": 512,
"discrepancy_modulation": false,
"drop_path": 0.0,
"dtype": "float32",
"enable_feature_gates": true,
"enable_router_gates": true,
"enable_v_energy": true,
"energy_amplification": 9.869604401089358,
"eos_token_id": 0,
"ff_mult": 3,
"ffn_hidden": 1536,
"hidden_size": 512,
"intermediate_size": 1536,
"layer_scale_init_value": 0.0001,
"learn_alpha": true,
"learn_radius": true,
"lm_attn_heads": 16,
"lm_ffn_hidden": 1536,
"lm_intermediate_size": 1536,
"lm_mixer_hidden": 768,
"lm_mqa_q_heads": 16,
"lm_num_attention_heads": 16,
"lm_num_key_value_heads": 16,
"lm_proj_drop": 0.1,
"lm_router_dropout": 0.1,
"lm_router_hidden": 64,
"lm_router_temperature": 1.0,
"lr_rank": 32,
"maha_init": 1.0,
"max_position_embeddings": 8192,
"max_seq_len_cached": 8192,
"metric": "maha_diag",
"mixer_hidden": 768,
"model_type": "moa_metric",
"mqa_q_heads": 16,
"n_branches": 3,
"num_attention_heads": 16,
"num_hidden_layers": 4,
"num_key_value_heads": 16,
"num_layers": 4,
"origin_init_scale": 0.0,
"pad_token_id": 0,
"proj_drop": 0.1,
"r_basis": 16,
"radius_init": 5.0,
"router_dropout": 0.1,
"router_hidden": 128,
"router_temperature": 1.25,
"router_topk": 2,
"theta_base": 10000.0,
"ti_reg_samples": 64,
"ti_reg_weight": 0.01,
"tie_word_embeddings": true,
"transformers_version": "5.0.0",
"use_balls": false,
"vocab_size": 50277,
"window_size": 512
}