Upload

Browse files

Files changed (8) hide show

funcineforge_zh_en/camplus.onnx +3 -0
funcineforge_zh_en/flow/config.yaml +105 -0
funcineforge_zh_en/flow/ds-model.pt.best/mp_rank_00_model_states.pt +3 -0
funcineforge_zh_en/llm/config.yaml +112 -0
funcineforge_zh_en/llm/ds-model.pt.best/mp_rank_00_model_states.pt +3 -0
funcineforge_zh_en/vocoder/config.yaml +26 -0
funcineforge_zh_en/vocoder/ds-model.pt.best/avg_5_removewn.pt +3 -0
funcineforge_zh_en/vocoder/hift_causal.hyper.yaml +38 -0

funcineforge_zh_en/camplus.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+size 28303423

funcineforge_zh_en/flow/config.yaml ADDED Viewed

	@@ -0,0 +1,105 @@

+model: CosyVoiceFlowMatching
+model_conf:
+  model_dtype: fp32
+  codebook_size: 6561
+  model_size: 1024
+  xvec_size: 192
+  feat_token_ratio: 2
+  mel_norm_type: null
+  lookahead_length: 3
+  training_cfg_rate: 0.2
+  inference_cfg_rate: 0.7
+  only_mask_loss: true
+  dit_conf:
+    dim: 1024
+    depth: 22
+    heads: 16
+    dim_head: 64
+    ff_mult: 2
+    mel_dim: 80
+    mu_dim: 80
+    spk_dim: 80
+    causal_mask_type:
+    - prob_min: 0
+      prob_max: 0.25
+      block_size: -1
+      ratio: 2
+    - prob_min: 0.25
+      prob_max: 0.5
+      block_size: 1
+      ratio: 2
+    - prob_min: 0.5
+      prob_max: 0.75
+      block_size: 15
+      ratio: 2
+    - prob_min: 0.75
+      prob_max: 1.0
+      block_size: 30
+      ratio: 2
+  mel_feat_conf:
+    n_fft: 1920
+    hop_length: 480
+    win_length: 1920
+    sampling_rate: 24000
+    n_mel_channels: 80
+    mel_fmin: 0
+    mel_fmax: 8000
+    center: false
+    feat_type: power_log
+  prompt_conf:
+    prompt_type: prefix
+    prompt_width_ratio_range:
+    - 0.7
+    - 1.0
+frontend: WhisperFrontend
+frontend_conf:
+  fs: 24000
+  n_mels: 80
+  do_pad_trim: false
+  filters_path: /cpfs_speech/zhifu.gzf/init_model/SenseVoiceSANM/assets/mel_filters.npz
+train_conf:
+  use_lora: false
+  accum_grad: 1
+  grad_clip: 1
+  max_epoch: 150
+  keep_nbest_models: 150000
+  log_interval: 50
+  effective_save_name_excludes:
+  - none
+  resume: true
+  validate_interval: 10000
+  save_checkpoint_interval: 10000
+  avg_nbest_model: 100
+  use_bf16: false
+  use_deepspeed: true
+  save_init_model: false
+  loss_rescale_by_rank: false
+  deepspeed_config: /nfs/yanzhang.ljx/workspace/FunCineForge/exps/decode_conf/ds_stage0_fp32.json
+optim: adamw
+optim_conf:
+  lr: 0.0001
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 10000
+dataset: CosyVoiceFlowMetaDataset
+dataset_conf:
+  wav_token_ratio: 960
+  load_meta_data_key: text,token,wav_path,spk_emb_path
+  set_invalid_xvec_zeros: true
+  index_ds: CosyVoice
+  data_split_num: 64
+  batch_sampler: BatchSampler
+  shuffle: true
+  sort_size: 512
+  batch_type: token
+  batch_size: 10000
+  batch_size_token_max: 12000
+  batch_size_sample_max: 100
+  max_token_length: 2250
+  max_text_length: null
+  batch_size_scale_threshold: 3000
+  num_workers: 6
+  retry: 100
+enable_tf32: true
+debug: false
+device: cpu

funcineforge_zh_en/flow/ds-model.pt.best/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54932cde43cb4beb54648b14ff701dd92eaa16423f463b594148dfaae6593a74
+size 3987933779

funcineforge_zh_en/llm/config.yaml ADDED Viewed

	@@ -0,0 +1,112 @@

+model: FunCineForgeLM
+model_conf:
+  lsm_weight: 0.0
+  length_normalized_loss: true
+  codec_unit: 6761
+  timespk_unit: 1550
+  face_size: 512
+llm: Qwen2-0.5B
+llm_conf:
+  hub: hf
+  freeze: false
+  llm_dtype: fp32
+  init_param_path: /nfs/yanzhang.ljx/workspace/FunCineForge/tokenizer/Qwen2-0.5B-CosyVoice-BlankEN
+  use_lora: false
+  lora_conf:
+    task_type: CAUSAL_LM
+    r: 16
+    lora_alpha: 32
+    lora_dropout: 0.05
+    bias: none
+    target_modules:
+    - q_proj
+    - v_proj
+train_conf:
+  use_lora: ${llm_conf.use_lora}
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 200
+  log_interval: 100
+  effective_save_name_excludes:
+  - none
+  resume: true
+  validate_interval: 5000
+  save_checkpoint_interval: 5000
+  keep_nbest_models: 100000
+  avg_nbest_model: 5
+  use_bf16: false
+  save_init_model: false
+  loss_rescale_by_rank: false
+  use_deepspeed: true
+  deepspeed_config: /nfs/yanzhang.ljx/workspace/FunCineForge/exps/decode_conf/ds_stage0_fp32.json
+optim: adamw
+optim_conf:
+  lr: 8.0e-05
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 2000
+dataset: FunCineForgeDataset
+dataset_conf:
+  use_emotion_clue: true
+  codebook_size: 6561
+  sos: 6561
+  eos: 6562
+  turn_of_speech: 6563
+  fill_token: 6564
+  ignore_id: -100
+  startofclue_token: 151646
+  endofclue_token: 151647
+  frame_shift: 25
+  timebook_size: 1500
+  pangbai: 1500
+  dubai: 1501
+  duihua: 1502
+  duoren: 1503
+  male: 1504
+  female: 1505
+  child: 1506
+  youth: 1507
+  adult: 1508
+  middle: 1509
+  elderly: 1510
+  speaker_id_start: 1511
+  index_ds: CosyVoice
+  dataloader: DataloaderMapStyle
+  load_meta_data_key: text,clue,token,face,dialogue
+  data_split_num: 1
+  batch_sampler: BatchSampler
+  shuffle: true
+  sort_size: 512
+  face_size: 512
+  batch_type: token
+  batch_size: 3000
+  batch_size_token_max: 20000
+  batch_size_sample_max: 100
+  max_token_length: 5000
+  max_text_length: 300
+  batch_size_scale_threshold: 3000
+  num_workers: 20
+  retry: 100
+  specaug: FunCineForgeSpecAug
+  specaug_conf:
+    apply_time_warp: false
+    apply_freq_mask: false
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0
+    - 0.05
+    num_time_mask: 10
+    fill_value: -100
+tokenizer: FunCineForgeTokenizer
+tokenizer_conf:
+  init_param_path: ${llm_conf.init_param_path}
+face_encoder: FaceRecIR101
+face_encoder_conf:
+  init_param_path: /nfs/yanzhang.ljx/workspace/FunCineForge/speaker_diarization/pretrained_models/face_recog_ir101.onnx
+enable_tf32: true
+debug: false
+train_data_set_list: /nfs/yanzhang.ljx/workspace/datasets/YingShi/clean/train.jsonl
+valid_data_set_list: /nfs/yanzhang.ljx/workspace/datasets/YingShi/clean/test.jsonl
+output_dir: /cpfs_fundata/yanzhang.ljx/workspace/exps/1m-8gpu/zh_en
+init_param: /nfs/hengwu.zty/exps/4m-8gpu/CosyVoice_MixedAM_5b15_Qwen2_500M_phn_fp32_fsq6561_simple_sys_minmo_l12_merge_cosyvoice3d5_baiyinku_emilia_yodas2_0605/ds-model.pt.ep0.290000/mp_rank_00_model_states.pt
+device: cpu

funcineforge_zh_en/llm/ds-model.pt.best/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ef73ff7c19b85cadecf0ea134173100c44353b643322788778c3f687f1f5a20
+size 6096417415

funcineforge_zh_en/vocoder/config.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+model: CausalHifiGan
+model_conf:
+    CausalHiFTGenerator_conf:
+        in_channels: 80
+        base_channels: 512
+        nb_harmonics: 8
+        sampling_rate: 24000
+        nsf_alpha: 0.1
+        nsf_sigma: 0.003
+        nsf_voiced_threshold: 10
+        upsample_rates: [8, 5, 3]
+        upsample_kernel_sizes: [16, 11, 7]
+        istft_params:
+            n_fft: 16
+            hop_len: 4
+        resblock_kernel_sizes: [3, 7, 11]
+        resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+        source_resblock_kernel_sizes: [7, 7, 11]
+        source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+        lrelu_slope: 0.1
+        audio_limit: 0.99
+    CausalConvRNNF0Predictor_conf:
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+    sample_rate: 24000

funcineforge_zh_en/vocoder/ds-model.pt.best/avg_5_removewn.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aac00e77b8bec73bdeebd2fa06b4bea531f396afa35f962d8dc1708c6e876d9f
+size 83141596

funcineforge_zh_en/vocoder/hift_causal.hyper.yaml ADDED Viewed

	@@ -0,0 +1,38 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1986]
+__set_seed2: !apply:numpy.random.seed [1986]
+__set_seed3: !apply:torch.manual_seed [1986]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1024
+llm_output_size: 1024
+spk_embed_dim: 192
+# model params
+# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users may find all corresponding classes/functions from one single yaml.
+hift: !new:cosyvoice.models.vocoder.hift_causal.CausalHiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 5, 3]
+    upsample_kernel_sizes: [16, 11, 7]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:cosyvoice.models.vocoder.f0_predictor_causal.CausalConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512