Upload DeepseekV4ForCausalLM

#1
by qgallouedec (HF Staff) · opened
Files changed (2)
  1. config.json +7 -6
  2. model.safetensors +2 -2
config.json CHANGED
@@ -7,7 +7,8 @@
   "bos_token_id": 0,
   "compress_ratios": [
     0,
-    0
+    4,
+    128
   ],
   "compress_rope_parameters": {
     "partial_rotary_factor": 0.125,
@@ -17,7 +18,7 @@
   "compress_rope_theta": 160000.0,
   "dtype": "bfloat16",
   "eos_token_id": 1,
-  "first_k_dense_replace": null,
+  "first_k_dense_replace": 1,
   "hc_eps": 1e-06,
   "hc_mult": 4,
   "hc_sinkhorn_iters": 20,
@@ -34,13 +35,13 @@
   "model_type": "deepseek_v4",
   "moe_intermediate_size": 2048,
   "n_group": null,
-  "n_routed_experts": 256,
+  "n_routed_experts": 4,
   "n_shared_experts": 1,
   "norm_topk_prob": true,
   "num_attention_heads": 4,
-  "num_experts_per_tok": 6,
-  "num_hash_layers": 3,
-  "num_hidden_layers": 2,
+  "num_experts_per_tok": 2,
+  "num_hash_layers": 2,
+  "num_hidden_layers": 3,
   "num_key_value_heads": 2,
   "num_nextn_predict_layers": 1,
   "o_groups": 8,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:53f6bf42b1d190e81a90dc7b9f30eaa4b3fa650313fe4785d26b4cb11bf7adef
- size 83997562
+ oid sha256:e5c0682e334f5e114dc223c4196864ef67442228dcfe4eec24c953d386ecd5e6
+ size 52073390
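
The weights change only swaps the Git LFS pointer: the new file's SHA-256 and byte size are recorded above. A rough local sanity check, assuming the new model.safetensors has been downloaded to the current directory:

import hashlib
from pathlib import Path

# Path is an assumption; point it at wherever the file was downloaded.
data = Path("model.safetensors").read_bytes()

# Expected values copied from the new LFS pointer in this PR.
assert len(data) == 52073390, f"unexpected size: {len(data)}"
assert hashlib.sha256(data).hexdigest() == (
    "e5c0682e334f5e114dc223c4196864ef67442228dcfe4eec24c953d386ecd5e6"
)
print("model.safetensors matches the LFS pointer")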