Upload DeepseekV4ForCausalLM

#5
by qgallouedec (HF Staff), opened
Files changed (2)
  1. config.json +14 -3
  2. model.safetensors +1 -1
config.json CHANGED
@@ -12,9 +12,14 @@
     128
   ],
   "compress_rope_parameters": {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 16,
+    "original_max_position_embeddings": 65536,
     "partial_rotary_factor": 0.125,
     "rope_theta": 160000.0,
-    "rope_type": "default"
+    "rope_type": "yarn",
+    "type": "yarn"
   },
   "compress_rope_theta": 160000.0,
   "dtype": "bfloat16",
@@ -43,7 +48,7 @@
   "num_experts_per_tok": 2,
   "num_hash_layers": 3,
   "num_hidden_layers": 4,
-  "num_key_value_heads": 2,
+  "num_key_value_heads": 1,
   "num_nextn_predict_layers": 1,
   "o_groups": 8,
   "o_lora_rank": 1024,
@@ -57,9 +62,14 @@
   "rms_norm_eps": 1e-06,
   "rope_interleave": true,
   "rope_parameters": {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 16,
+    "original_max_position_embeddings": 65536,
     "partial_rotary_factor": 0.125,
     "rope_theta": 10000.0,
-    "rope_type": "default"
+    "rope_type": "yarn",
+    "type": "yarn"
   },
   "rope_theta": 10000.0,
   "routed_scaling_factor": 1.5,
@@ -70,6 +80,7 @@
   "swiglu_limit": 10.0,
   "tie_word_embeddings": false,
   "topk_group": null,
+  "topk_method": "noaux_tc",
   "transformers_version": "5.7.0.dev0",
   "use_cache": true,
   "v_head_dim": null,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dcb246e5c57a24315997e93d55e759f78457690fe73efc19e5e58be0bb0f28a5
+oid sha256:4076740c031a3b6670e03573ec7f9ea4095e54c5dde682ac8e2eb629572cd39c
 size 37758460
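The weights blob itself lives in Git LFS, so the diff only shows the pointer's SHA-256 changing while the size stays at 37,758,460 bytes. A quick sanity check of a downloaded copy against the new pointer, where the local filename is the only assumption:

```python
import hashlib

# OID and size from the updated LFS pointer above.
expected_oid = "4076740c031a3b6670e03573ec7f9ea4095e54c5dde682ac8e2eb629572cd39c"
expected_size = 37_758_460

h = hashlib.sha256()
size = 0
with open("model.safetensors", "rb") as f:  # assumed local path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
        size += len(chunk)

assert size == expected_size, f"size mismatch: {size} != {expected_size}"
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("model.safetensors matches the updated LFS pointer")
```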