ONNX
Safetensors
Chinese
English
Dubbing-model
xuan3986 committed on
Commit
c0270b7
·
verified ·
1 Parent(s): edd8172
funcineforge_zh_en/camplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
funcineforge_zh_en/flow/config.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: CosyVoiceFlowMatching
2
+ model_conf:
3
+ model_dtype: fp32
4
+ codebook_size: 6561
5
+ model_size: 1024
6
+ xvec_size: 192
7
+ feat_token_ratio: 2
8
+ mel_norm_type: null
9
+ lookahead_length: 3
10
+ training_cfg_rate: 0.2
11
+ inference_cfg_rate: 0.7
12
+ only_mask_loss: true
13
+ dit_conf:
14
+ dim: 1024
15
+ depth: 22
16
+ heads: 16
17
+ dim_head: 64
18
+ ff_mult: 2
19
+ mel_dim: 80
20
+ mu_dim: 80
21
+ spk_dim: 80
22
+ causal_mask_type:
23
+ - prob_min: 0
24
+ prob_max: 0.25
25
+ block_size: -1
26
+ ratio: 2
27
+ - prob_min: 0.25
28
+ prob_max: 0.5
29
+ block_size: 1
30
+ ratio: 2
31
+ - prob_min: 0.5
32
+ prob_max: 0.75
33
+ block_size: 15
34
+ ratio: 2
35
+ - prob_min: 0.75
36
+ prob_max: 1.0
37
+ block_size: 30
38
+ ratio: 2
39
+ mel_feat_conf:
40
+ n_fft: 1920
41
+ hop_length: 480
42
+ win_length: 1920
43
+ sampling_rate: 24000
44
+ n_mel_channels: 80
45
+ mel_fmin: 0
46
+ mel_fmax: 8000
47
+ center: false
48
+ feat_type: power_log
49
+ prompt_conf:
50
+ prompt_type: prefix
51
+ prompt_width_ratio_range:
52
+ - 0.7
53
+ - 1.0
54
+ frontend: WhisperFrontend
55
+ frontend_conf:
56
+ fs: 24000
57
+ n_mels: 80
58
+ do_pad_trim: false
59
+ filters_path: /cpfs_speech/zhifu.gzf/init_model/SenseVoiceSANM/assets/mel_filters.npz
60
+ train_conf:
61
+ use_lora: false
62
+ accum_grad: 1
63
+ grad_clip: 1
64
+ max_epoch: 150
65
+ keep_nbest_models: 150000
66
+ log_interval: 50
67
+ effective_save_name_excludes:
68
+ - none
69
+ resume: true
70
+ validate_interval: 10000
71
+ save_checkpoint_interval: 10000
72
+ avg_nbest_model: 100
73
+ use_bf16: false
74
+ use_deepspeed: true
75
+ save_init_model: false
76
+ loss_rescale_by_rank: false
77
+ deepspeed_config: /nfs/yanzhang.ljx/workspace/FunCineForge/exps/decode_conf/ds_stage0_fp32.json
78
+ optim: adamw
79
+ optim_conf:
80
+ lr: 0.0001
81
+ scheduler: warmuplr
82
+ scheduler_conf:
83
+ warmup_steps: 10000
84
+ dataset: CosyVoiceFlowMetaDataset
85
+ dataset_conf:
86
+ wav_token_ratio: 960
87
+ load_meta_data_key: text,token,wav_path,spk_emb_path
88
+ set_invalid_xvec_zeros: true
89
+ index_ds: CosyVoice
90
+ data_split_num: 64
91
+ batch_sampler: BatchSampler
92
+ shuffle: true
93
+ sort_size: 512
94
+ batch_type: token
95
+ batch_size: 10000
96
+ batch_size_token_max: 12000
97
+ batch_size_sample_max: 100
98
+ max_token_length: 2250
99
+ max_text_length: null
100
+ batch_size_scale_threshold: 3000
101
+ num_workers: 6
102
+ retry: 100
103
+ enable_tf32: true
104
+ debug: false
105
+ device: cpu
funcineforge_zh_en/flow/ds-model.pt.best/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54932cde43cb4beb54648b14ff701dd92eaa16423f463b594148dfaae6593a74
3
+ size 3987933779
funcineforge_zh_en/llm/config.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: FunCineForgeLM
2
+ model_conf:
3
+ lsm_weight: 0.0
4
+ length_normalized_loss: true
5
+ codec_unit: 6761
6
+ timespk_unit: 1550
7
+ face_size: 512
8
+ llm: Qwen2-0.5B
9
+ llm_conf:
10
+ hub: hf
11
+ freeze: false
12
+ llm_dtype: fp32
13
+ init_param_path: /nfs/yanzhang.ljx/workspace/FunCineForge/tokenizer/Qwen2-0.5B-CosyVoice-BlankEN
14
+ use_lora: false
15
+ lora_conf:
16
+ task_type: CAUSAL_LM
17
+ r: 16
18
+ lora_alpha: 32
19
+ lora_dropout: 0.05
20
+ bias: none
21
+ target_modules:
22
+ - q_proj
23
+ - v_proj
24
+ train_conf:
25
+ use_lora: ${llm_conf.use_lora}
26
+ accum_grad: 1
27
+ grad_clip: 5
28
+ max_epoch: 200
29
+ log_interval: 100
30
+ effective_save_name_excludes:
31
+ - none
32
+ resume: true
33
+ validate_interval: 5000
34
+ save_checkpoint_interval: 5000
35
+ keep_nbest_models: 100000
36
+ avg_nbest_model: 5
37
+ use_bf16: false
38
+ save_init_model: false
39
+ loss_rescale_by_rank: false
40
+ use_deepspeed: true
41
+ deepspeed_config: /nfs/yanzhang.ljx/workspace/FunCineForge/exps/decode_conf/ds_stage0_fp32.json
42
+ optim: adamw
43
+ optim_conf:
44
+ lr: 8.0e-05
45
+ scheduler: warmuplr
46
+ scheduler_conf:
47
+ warmup_steps: 2000
48
+ dataset: FunCineForgeDataset
49
+ dataset_conf:
50
+ use_emotion_clue: true
51
+ codebook_size: 6561
52
+ sos: 6561
53
+ eos: 6562
54
+ turn_of_speech: 6563
55
+ fill_token: 6564
56
+ ignore_id: -100
57
+ startofclue_token: 151646
58
+ endofclue_token: 151647
59
+ frame_shift: 25
60
+ timebook_size: 1500
61
+ pangbai: 1500
62
+ dubai: 1501
63
+ duihua: 1502
64
+ duoren: 1503
65
+ male: 1504
66
+ female: 1505
67
+ child: 1506
68
+ youth: 1507
69
+ adult: 1508
70
+ middle: 1509
71
+ elderly: 1510
72
+ speaker_id_start: 1511
73
+ index_ds: CosyVoice
74
+ dataloader: DataloaderMapStyle
75
+ load_meta_data_key: text,clue,token,face,dialogue
76
+ data_split_num: 1
77
+ batch_sampler: BatchSampler
78
+ shuffle: true
79
+ sort_size: 512
80
+ face_size: 512
81
+ batch_type: token
82
+ batch_size: 3000
83
+ batch_size_token_max: 20000
84
+ batch_size_sample_max: 100
85
+ max_token_length: 5000
86
+ max_text_length: 300
87
+ batch_size_scale_threshold: 3000
88
+ num_workers: 20
89
+ retry: 100
90
+ specaug: FunCineForgeSpecAug
91
+ specaug_conf:
92
+ apply_time_warp: false
93
+ apply_freq_mask: false
94
+ apply_time_mask: true
95
+ time_mask_width_ratio_range:
96
+ - 0
97
+ - 0.05
98
+ num_time_mask: 10
99
+ fill_value: -100
100
+ tokenizer: FunCineForgeTokenizer
101
+ tokenizer_conf:
102
+ init_param_path: ${llm_conf.init_param_path}
103
+ face_encoder: FaceRecIR101
104
+ face_encoder_conf:
105
+ init_param_path: /nfs/yanzhang.ljx/workspace/FunCineForge/speaker_diarization/pretrained_models/face_recog_ir101.onnx
106
+ enable_tf32: true
107
+ debug: false
108
+ train_data_set_list: /nfs/yanzhang.ljx/workspace/datasets/YingShi/clean/train.jsonl
109
+ valid_data_set_list: /nfs/yanzhang.ljx/workspace/datasets/YingShi/clean/test.jsonl
110
+ output_dir: /cpfs_fundata/yanzhang.ljx/workspace/exps/1m-8gpu/zh_en
111
+ init_param: /nfs/hengwu.zty/exps/4m-8gpu/CosyVoice_MixedAM_5b15_Qwen2_500M_phn_fp32_fsq6561_simple_sys_minmo_l12_merge_cosyvoice3d5_baiyinku_emilia_yodas2_0605/ds-model.pt.ep0.290000/mp_rank_00_model_states.pt
112
+ device: cpu
funcineforge_zh_en/llm/ds-model.pt.best/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ef73ff7c19b85cadecf0ea134173100c44353b643322788778c3f687f1f5a20
3
+ size 6096417415
funcineforge_zh_en/vocoder/config.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: CausalHifiGan
2
+ model_conf:
3
+ CausalHiFTGenerator_conf:
4
+ in_channels: 80
5
+ base_channels: 512
6
+ nb_harmonics: 8
7
+ sampling_rate: 24000
8
+ nsf_alpha: 0.1
9
+ nsf_sigma: 0.003
10
+ nsf_voiced_threshold: 10
11
+ upsample_rates: [8, 5, 3]
12
+ upsample_kernel_sizes: [16, 11, 7]
13
+ istft_params:
14
+ n_fft: 16
15
+ hop_len: 4
16
+ resblock_kernel_sizes: [3, 7, 11]
17
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
18
+ source_resblock_kernel_sizes: [7, 7, 11]
19
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
20
+ lrelu_slope: 0.1
21
+ audio_limit: 0.99
22
+ CausalConvRNNF0Predictor_conf:
23
+ num_class: 1
24
+ in_channels: 80
25
+ cond_channels: 512
26
+ sample_rate: 24000
funcineforge_zh_en/vocoder/ds-model.pt.best/avg_5_removewn.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac00e77b8bec73bdeebd2fa06b4bea531f396afa35f962d8dc1708c6e876d9f
3
+ size 83141596
funcineforge_zh_en/vocoder/hift_causal.hyper.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1024
11
+ llm_output_size: 1024
12
+ spk_embed_dim: 192
13
+
14
+ # model params
15
+ # for every class/function included in this repo, we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from one single yaml.
16
+ hift: !new:cosyvoice.models.vocoder.hift_causal.CausalHiFTGenerator
17
+ in_channels: 80
18
+ base_channels: 512
19
+ nb_harmonics: 8
20
+ sampling_rate: !ref <sample_rate>
21
+ nsf_alpha: 0.1
22
+ nsf_sigma: 0.003
23
+ nsf_voiced_threshold: 10
24
+ upsample_rates: [8, 5, 3]
25
+ upsample_kernel_sizes: [16, 11, 7]
26
+ istft_params:
27
+ n_fft: 16
28
+ hop_len: 4
29
+ resblock_kernel_sizes: [3, 7, 11]
30
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
31
+ source_resblock_kernel_sizes: [7, 7, 11]
32
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
33
+ lrelu_slope: 0.1
34
+ audio_limit: 0.99
35
+ f0_predictor: !new:cosyvoice.models.vocoder.f0_predictor_causal.CausalConvRNNF0Predictor
36
+ num_class: 1
37
+ in_channels: 80
38
+ cond_channels: 512