xuan3986 commited on
Commit
03022ee
·
verified ·
1 Parent(s): 21a001f

Upload 111 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .gitignore +1 -0
  3. LICENSE +201 -0
  4. README.md +155 -14
  5. README_zh.md +153 -0
  6. app.py +415 -0
  7. data/ref.wav +3 -0
  8. data/sample.mp4 +3 -0
  9. decode_conf/decode.yaml +42 -0
  10. decode_conf/diar.yaml +51 -0
  11. decode_conf/ds_stage0_fp32.json +33 -0
  12. funcineforge/.DS_Store +0 -0
  13. funcineforge/__init__.py +7 -0
  14. funcineforge/auto/__init__.py +0 -0
  15. funcineforge/auto/auto_frontend.py +95 -0
  16. funcineforge/auto/auto_model.py +173 -0
  17. funcineforge/datasets/__init__.py +2 -0
  18. funcineforge/datasets/datasets.py +193 -0
  19. funcineforge/datasets/index_ds.py +151 -0
  20. funcineforge/download/__init__.py +0 -0
  21. funcineforge/download/download_model_from_hub.py +220 -0
  22. funcineforge/download/file.py +320 -0
  23. funcineforge/download/name_maps_from_hub.py +42 -0
  24. funcineforge/face/__init__.py +1 -0
  25. funcineforge/face/face_recognition.py +16 -0
  26. funcineforge/models/__init__.py +5 -0
  27. funcineforge/models/causal_hifigan.py +834 -0
  28. funcineforge/models/flow_matching_model.py +514 -0
  29. funcineforge/models/inference_model.py +116 -0
  30. funcineforge/models/language_model.py +274 -0
  31. funcineforge/models/modules/__init__.py +0 -0
  32. funcineforge/models/modules/dit_flow_matching/__init__.py +0 -0
  33. funcineforge/models/modules/dit_flow_matching/dit_model.py +208 -0
  34. funcineforge/models/modules/dit_flow_matching/dit_modules.py +622 -0
  35. funcineforge/models/modules/hifigan/__init__.py +14 -0
  36. funcineforge/models/modules/hifigan/activations.py +120 -0
  37. funcineforge/models/modules/hifigan/discriminator.py +299 -0
  38. funcineforge/models/modules/hifigan/generator.py +625 -0
  39. funcineforge/models/modules/hifigan/mel_spectrum.py +93 -0
  40. funcineforge/models/modules/hifigan/nsf_utils.py +253 -0
  41. funcineforge/models/specaug/__init__.py +0 -0
  42. funcineforge/models/specaug/mask_along_axis.py +204 -0
  43. funcineforge/models/specaug/specaug.py +103 -0
  44. funcineforge/models/specaug/time_warp.py +89 -0
  45. funcineforge/models/utils/__init__.py +2 -0
  46. funcineforge/models/utils/llm_decoding.py +178 -0
  47. funcineforge/models/utils/mask_along_axis.py +76 -0
  48. funcineforge/models/utils/masks.py +132 -0
  49. funcineforge/models/utils/nets_utils.py +734 -0
  50. funcineforge/tokenizer/__init__.py +1 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/ref.wav filter=lfs diff=lfs merge=lfs -text
37
+ data/sample.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pyc
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,14 +1,155 @@
1
- ---
2
- title: Fun CineForge Demo
3
- emoji: 🏢
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 6.9.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Fun-CineForge-zh-en-v1-0.5B
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### <p align="center">「English | [简体中文](./README_zh.md)」</p>
2
+
3
+ <p align="center">
4
+ <b>🎬 Fun-CineForge: A Unified Dataset Pipeline and Model for Zero-Shot Movie Dubbing<br>
5
+ in Diverse Cinematic Scenes</b>
6
+ </p>
7
+
8
+ <div align="center">
9
+
10
+ ![license](https://img.shields.io/github/license/modelscope/modelscope.svg)
11
+ <a href=""><img src="https://img.shields.io/badge/OS-Linux-orange.svg"></a>
12
+ <a href=""><img src="https://img.shields.io/badge/Python->=3.8-aff.svg"></a>
13
+ <a href=""><img src="https://img.shields.io/badge/Pytorch->=2.1-blue"></a>
14
+ </div>
15
+
16
+ <div align="center">
17
+ <h4><a href="#Dataset&Demo">Dataset & Demo</a>
18
+ |<a href="#Environment">Environment</a>
19
+ |<a href="#Dataset-Pipeline">Dataset Pipeline</a>
20
+ |<a href="#Dubbing-Model">Dubbing Model</a>
21
+ |<a href="#Recent-Updates">Recent Updates</a>
22
+ |<a href="#Publication">Publication</a>
23
+ |<a href="#Comminicate">Communicate</a>
24
+ </h4>
25
+ </div>
26
+
27
+ **Fun-CineForge** contains an end-to-end dataset pipeline for producing large-scale dubbing datasets and an MLLM-based dubbing model designed for diverse cinematic scenes. Using this pipeline, we constructed the first large-scale Chinese television dubbing dataset CineDub-CN, which includes rich annotations and diverse scenes. In monologue, narration, dialogue, and multi-speaker scenes, our dubbing model consistently outperforms state-of-the-art methods in terms of audio quality, lip-sync, timbre transition, and instruction following.
28
+
29
+ <a name="Dataset&Demo"></a>
30
+ ## Dataset & Demo 🎬
31
+ You can access [https://funcineforge.github.io/](https://funcineforge.github.io/) to get our CineDub-CN dataset samples and demo samples.
32
+
33
+ <a name="Environment"></a>
34
+ ## Environment Installation
35
+
36
+ Fun-CineForge relies on Conda and Python environments. Execute **setup.py** to automatically install the entire project environment and open-source model.
37
+
38
+ ```shell
39
+ # Conda
40
+ git clone git@github.com:FunAudioLLM/FunCineForge.git
41
+ conda create -n FunCineForge python=3.10 -y && conda activate FunCineForge
42
+ sudo apt-get install ffmpeg
43
+ # Initial settings
44
+ python setup.py
45
+ ```
46
+
47
+ <a name="Dataset-Pipeline"></a>
48
+ ## Dataset Pipeline 🔨
49
+
50
+ ### Data collection
51
+ If you want to produce your own data,
52
+ we recommend that you refer to the following requirements to collect the corresponding movies or television series.
53
+
54
+ 1. Video source: TV dramas or movies (not documentaries), with plenty of monologue or dialogue scenes and clear, unobstructed faces (e.g., no masks or veils).
55
+ 2. Speech Requirements: Standard pronunciation, clear articulation, prominent human voice. Avoid materials with strong dialects, excessive background noise, or strong colloquialism.
56
+ 3. Image Requirements: High resolution, clear facial details, sufficient lighting, avoiding extremely dark or strong backlit scenes.
57
+
58
+ ### How to use
59
+
60
+ - [1] Standardize video format and name; trim the beginning and end of long videos; extract the audio from the trimmed video. (default is to trim 10 seconds from both the beginning and end.)
61
+ ```shell
62
+ python normalize_trim.py --root datasets/raw_zh --intro 10 --outro 10
63
+ ```
64
+
65
+ - [2] [Speech Separation](./speech_separation/README.md). The audio is used to separate the vocals from the instrumental music.
66
+ ```shell
67
+ cd speech_separation
68
+ python run.py --root datasets/clean/zh --gpus 0 1 2 3
69
+ ```
70
+
71
+ - [3] [VideoClipper](./video_clip/README.md). For long videos, VideoClipper is used to obtain sentence-level subtitle files and clip the long video into segments based on timestamps. Now it supports bilingualism in both Chinese and English. Below is an example in Chinese. It is recommended to use gpu acceleration for English.
72
+ ```shell
73
+ cd video_clip
74
+ bash run.sh --stage 1 --stop_stage 2 --input datasets/raw_zh --output datasets/clean/zh --lang zh --device cpu
75
+ ```
76
+
77
+ - Enforce the video duration limit and check files for cleanup. (Without --execute, the files slated for deletion are only printed. After reviewing the list, add --execute to actually delete them.)
78
+ ```shell
79
+ python clean_video.py --root datasets/clean/zh
80
+ python clean_srt.py --root datasets/clean/zh --lang zh
81
+ ```
82
+
83
+ - [4] [Speaker Diarization](./speaker_diarization/README.md). Multimodal active speaker recognition obtains RTTM files; identifies the speaker's facial frames, extracts frame-level speaker face and lip raw data.
84
+ ```shell
85
+ cd speaker_diarization
86
+ bash run.sh --stage 1 --stop_stage 4 --hf_access_token hf_xxx --root datasets/clean/zh --gpus "0 1 2 3"
87
+ ```
88
+
89
+ - (Reference) Extract speech tokens based on the CosyVoice3 tokenizer for llm training.
90
+ ```shell
91
+ python speech_tokenizer.py --root datasets/clean/zh
92
+ ```
93
+
94
+ - [5] Multimodal CoT Correction. Based on general-purpose MLLMs, the system uses audio, ASR text, and RTTM files as input. It leverages Chain-of-Thought (CoT) reasoning to extract clues and corrects the results of the specialized models. It also annotates character age, gender, and vocal timbre. Experimental results show that this strategy reduces the CER from 4.53% to 0.94% and the speaker diarization error rate from 8.38% to 1.20%, achieving quality comparable to or even better than manual transcription. Adding the --resume enables breakpoint COT inference to prevent wasted resources from repeated COT inferences. Now supports both Chinese and English.
95
+ ```shell
96
+ python cot.py --root_dir datasets/clean/zh --lang zh --provider google --model gemini-3-pro-preview --api_key xxx --resume
97
+ python cot.py --root_dir datasets/clean/en --lang en --provider google --model gemini-3-pro-preview --api_key xxx --resume
98
+ ```
99
+
100
+ - The construction of the dataset retrieval file will read all production data, perform bidirectional verification of script content and speaker separation results.
101
+ ```shell
102
+ python build_datasets.py --root_zh datasets/clean/zh --root_en datasets/clean/en --out_dir datasets/clean --save
103
+ ```
104
+
105
+ <a name="Dubbing-Model"></a>
106
+ ## Dubbing Model ⚙️
107
+ We've open-sourced the inference code and the **infer.sh** script, and provided some test cases in the data folder for your experience. Inference requires a consumer-grade GPU. Run the following command:
108
+
109
+ ```shell
110
+ cd exps
111
+ bash infer.sh
112
+ ```
113
+
114
+ The API for multi-speaker dubbing from raw videos and SRT scripts is under development ...
115
+
116
+ <a name="Recent-Updates"></a>
117
+ ## Recent Updates 🚀
118
+ - 2025/12/18: Fun-CineForge dataset pipeline toolkit is online! 🔥
119
+ - 2026/01/19: Chinese demo samples and CineDub-CN dataset samples released. 🔥
120
+ - 2026/01/25: Fix some environmental and operational issues.
121
+ - 2026/02/09: Optimized the data pipeline and added support for English videos.
122
+ - 2026/03/05: English demo samples and CineDub-EN dataset samples released. 🔥
123
+ - 2026/03/16: Open source inference code and checkpoints. 🔥
124
+
125
+ <a name="Publication"></a>
126
+ ## Publication 📚
127
+ If you use our dataset or code, please cite the following paper:
128
+ <pre>
129
+ @misc{liu2026funcineforgeunifieddatasettoolkit,
130
+ title={FunCineForge: A Unified Dataset Toolkit and Model for Zero-Shot Movie Dubbing in Diverse Cinematic Scenes},
131
+ author={Jiaxuan Liu and Yang Xiang and Han Zhao and Xiangang Li and Zhenhua Ling},
132
+ year={2026},
133
+ eprint={2601.14777},
134
+ archivePrefix={arXiv},
135
+ primaryClass={cs.CV},
136
+ }
137
+ </pre>
138
+
139
+ <a name="Comminicate"></a>
140
+ ## Communicate 🍟
141
+ The Fun-CineForge open-source project is developed and maintained by the Tongyi Lab Speech Team and a student from NERCSLIP, University of Science and Technology of China.
142
+ We welcome you to participate in discussions on Fun-CineForge [GitHub Issues](https://github.com/FunAudioLLM/FunCineForge/issues) or contact us for collaborative development.
143
+ For any questions, you can contact the [developer](mailto:jxliu@mail.ustc.edu.cn).
144
+
145
+ ⭐ Hope you will support Fun-CineForge. Thank you.
146
+
147
+ ### Disclaimer
148
+
149
+ This repository contains research artifacts:
150
+
151
+ ⚠️ Currently not a commercial product of Tongyi Lab.
152
+
153
+ ⚠️ Released for academic research / cutting-edge exploration purposes
154
+
155
+ ⚠️ CineDub Dataset samples are subject to specific license terms.
README_zh.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### <p align="center">「[English](./README.md) | 简体中文」</p>
2
+
3
+ <p align="center">
4
+ <b>🎬 Fun-CineForge:一种用于多样化影视场景零样本配音的统一数据集管道和模型</b>
5
+ </p>
6
+
7
+ <div align="center">
8
+
9
+ ![license](https://img.shields.io/github/license/modelscope/modelscope.svg)
10
+ <a href=""><img src="https://img.shields.io/badge/OS-Linux-orange.svg"></a>
11
+ <a href=""><img src="https://img.shields.io/badge/Python->=3.8-aff.svg"></a>
12
+ <a href=""><img src="https://img.shields.io/badge/Pytorch->=2.1-blue"></a>
13
+ </div>
14
+
15
+ <div align="center">
16
+ <h4><a href="#数据集&样例">数据集 & 样例</a>
17
+ |<a href="#环境安装">环境安装</a>
18
+ |<a href="#数据集管道">数据集管道</a>
19
+ |<a href="#配音模型">配音模型</a>
20
+ |<a href="#近期更新">近期更新</a>
21
+ |<a href="#发表">发表</a>
22
+ |<a href="#社区交流">社区交流</a>
23
+ </h4>
24
+ </div>
25
+
26
+ **Fun-CineForge** 包含一个生产大规模配音数据集的端到端数据集管道,和一个基于多模态大模型的配音模型,该模型专为多样的电影场景而设计。利用该管道,我们构建了首个大规模中文电视剧配音数据集 CineDub-CN,该数据集包含丰富的标注和多样化的场景。在独白、旁白、对话和多说话人场景中,我们的配音模型在音频质量、唇形同步、音色转换和指令遵循等方面全部优于最先进的方法。
27
+
28
+ <a name="数据集&样例"></a>
29
+ ## 数据集 & 样例 🎬
30
+ 您可以访问此 [https://funcineforge.github.io/](https://funcineforge.github.io/) 获取我们的 CineDub-CN 数据集和 CineDub-EN 数据集样例和演示样例。
31
+
32
+ <a name="环境安装"></a>
33
+ ## 环境安装
34
+
35
+ Fun-CineForge 依赖 Conda 和 Python 环境。执行 **setup.py** 自动安装整个项目环境和开源模型。
36
+
37
+ ```shell
38
+ # Conda
39
+ git clone git@github.com:FunAudioLLM/FunCineForge.git
40
+ conda create -n FunCineForge python=3.10 -y && conda activate FunCineForge
41
+ sudo apt-get install ffmpeg
42
+ # 初始化设置
43
+ python setup.py
44
+ ```
45
+
46
+ <a name="数据集管道"></a>
47
+ ## 数据集管道 🔨
48
+
49
+ ### 数据收集
50
+ 如果您想自行生产数据,我们建议您参考下面的要求收集相应的电影或影视剧。
51
+
52
+ 1. 视频来源:电视剧或电影,非纪录片,人物独白或对话场景较多,人脸清晰且无遮挡(如无面罩、面纱)。
53
+ 2. 语音要求:发音标准,吐字清晰,人声突出。避免方言浓重、背景噪音过大或口语感过强的素材。
54
+ 3. 图片要求:高分辨率,面部细节清晰,光线充足,避免极端阴暗或强烈逆光的场景。
55
+
56
+ ### 使用方法
57
+
58
+ - [1] 将视频格式、名称标准化;裁剪长视频的片头片尾;提取裁剪后视频的音频。(默认是从起止各裁剪 10 秒。)
59
+ ```shell
60
+ python normalize_trim.py --root datasets/raw_zh --intro 10 --outro 10
61
+ ```
62
+
63
+ - [2] [Speech Separation](./speech_separation/README.md). 音频进行人声乐声分离。
64
+ ```shell
65
+ cd speech_separation
66
+ python run.py --root datasets/clean/zh --gpus 0 1 2 3
67
+ ```
68
+
69
+ - [3] [VideoClipper](./video_clip/README.md). 对于长视频,使用 VideoClipper 获取句子级别的字幕文件,并根据时间戳将长视频剪辑成片段。现在它支持中英双语。以下是中文示例。英文建议采用 gpu 加速处理。
70
+ ```shell
71
+ cd video_clip
72
+ bash run.sh --stage 1 --stop_stage 2 --input datasets/raw_zh --output datasets/clean/zh --lang zh --device cpu
73
+ ```
74
+
75
+ - 视频时长限制及清理检查。(若不使用--execute参数,则仅打印已预删除的文件。检查后,若需确认删除,请添加--execute参数。)
76
+ ```shell
77
+ python clean_video.py --root datasets/clean/zh
78
+ python clean_srt.py --root datasets/clean/zh --lang zh
79
+ ```
80
+
81
+ - [4] [Speaker Diarization](./speaker_diarization/README.md). 多模态主动说话人识别,得到 RTTM 文件;识别说话人的面部帧,提取帧级的说话人面部和唇部原始数据,从面部帧中识别说话帧,提取说话帧的面部特征。
82
+ ```shell
83
+ cd speaker_diarization
84
+ bash run.sh --stage 1 --stop_stage 4 --hf_access_token hf_xxx --root datasets/clean/zh --gpus "0 1 2 3"
85
+ ```
86
+
87
+ - (参考)基于 CosyVoice3 tokenizer 提取 speech tokens 用于大模型训练。
88
+ ```shell
89
+ python speech_tokenizer.py --root datasets/clean/zh
90
+ ```
91
+
92
+ - [5] 多模态思维链校正。该系统基于通用多模态大模型,以音频、ASR 抄本和 RTTM 文件为输入,利用思维链推理来提取线索,并校正专用模型的结果,并标注人物年龄、性别和音色。实验结果表明,该策略将词错率从4.53% 降低到 0.94%,说话人识别错误率从 8.38% 降低到 1.20%,其质量可与人工转录相媲美,甚至更优。添加--resume选项可启用断点思维链推理,以避免重复思维链推理造成的资源浪费。现支持中英文。
93
+ ```shell
94
+ python cot.py --root_dir datasets/clean/zh --lang zh --provider google --model gemini-3-pro-preview --api_key xxx --resume
95
+ python cot.py --root_dir datasets/clean/en --lang en --provider google --model gemini-3-pro-preview --api_key xxx --resume
96
+ ```
97
+
98
+ - 数据集检索文件的构建会读取生产的所有数据,双向校验脚本内容和说话人分离结果。
99
+ ```shell
100
+ python build_datasets.py --root_zh datasets/clean/zh --root_en datasets/clean/en --out_dir datasets/clean --save
101
+ ```
102
+
103
+ <a name="Dubbing-Model"></a>
104
+ ## 配音模型 ⚙️
105
+ 我们开源了推理代码和 **infer.sh** 脚本,在 data 文件夹中提供了一些测试样例,以供体验。推理需要一张消费级 GPU。按下面的命令运行:
106
+
107
+ ```shell
108
+ cd exps
109
+ bash infer.sh
110
+ ```
111
+
112
+ 从原始视频和 SRT 脚本进行多人配音的 API 调用接口在开发中 ...
113
+
114
+ <a name="近期更新"></a>
115
+ ## 近期更新 🚀
116
+ - 2025/12/18:Fun-CineForge 数据集管道工具包上线!🔥
117
+ - 2026/01/19:发布中文演示样例和 CineDub-CN 数据集样例。 🔥
118
+ - 2026/01/25:修复了一些环境和运行问题。
119
+ - 2026/02/09:优化了数据管道,新增支持英文视频的能力。
120
+ - 2026/03/05:发布英文演示样例和 CineDub-EN 数据集样例。 🔥
121
+ - 2026/03/16:开源推理代码和 checkpoints。 🔥
122
+
123
+ <a name="发表"></a>
124
+ ## 发表 📚
125
+ 如果您使用了我们的数据集或代码,请引用以下论文:
126
+ <pre>
127
+ @misc{liu2026funcineforgeunifieddatasettoolkit,
128
+ title={FunCineForge: A Unified Dataset Toolkit and Model for Zero-Shot Movie Dubbing in Diverse Cinematic Scenes},
129
+ author={Jiaxuan Liu and Yang Xiang and Han Zhao and Xiangang Li and Zhenhua Ling},
130
+ year={2026},
131
+ eprint={2601.14777},
132
+ archivePrefix={arXiv},
133
+ primaryClass={cs.CV},
134
+ }
135
+ </pre>
136
+
137
+
138
+ <a name="社区交流"></a>
139
+ ## 社区交流 🍟
140
+ Fun-CineForge 开源项目由通义实验室语音团队和中国科学技术大学 NERCSLIP 学生开发并维护,我们欢迎您在 Fun-CineForge [GitHub Issues](https://github.com/FunAudioLLM/FunCineForge/issues) 参与问题讨论,或联系我们合作开发。
141
+ 有任何问题您可以联系[开发者](mailto:jxliu@mail.ustc.edu.cn)。
142
+
143
+ ⭐ 希望您支持 Fun-CineForge,谢谢。
144
+
145
+ ### 免责声明
146
+
147
+ 该仓库包含的研究成果:
148
+
149
+ ⚠️ 目前非通义实验室商业化产品
150
+
151
+ ⚠️ 供学术研究/前沿探索用途
152
+
153
+ ⚠️ 数据集样例受特定许可条款约束
app.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import json
4
+ import torch
5
+ import gradio as gr
6
+ import typing
7
+ import time
8
+ import shutil
9
+ from moviepy.video.io.VideoFileClip import VideoFileClip, AudioFileClip
10
+ from moviepy.audio.AudioClip import CompositeAudioClip
11
+ from modelscope import snapshot_download
12
+ from utils import get_video_duration, generate_jsonl_data, validate_timestamps, parse_srt_content
13
+ # 尝试导入模型库
14
+ from funcineforge import AutoFrontend
15
+ from speaker_diarization.run import GlobalModels
16
+ snapshot_download(
17
+ repo_id="FunAudioLLM/Fun-CineForge",
18
+ revision='v1.0.0',
19
+ local_dir='pretrained_models',
20
+ ignore_patterns=[
21
+ "*.md",
22
+ ".git*",
23
+ "funcineforge_zh_en/llm/config.yaml"
24
+ ],
25
+ repo_type="model",
26
+ )
27
+
28
+
29
+ # ==================== 配置区域 ====================
30
+ DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
31
+ SERVER_PORT = 7860
32
+ TEMP_DIR = "temp_workdir"
33
+ CONFIG_FRONTEND = "decode_conf/diar.yaml"
34
+ CONFIG_MODEL = "decode_conf/decode.yaml"
35
+ PRETRAIN = "pretrained_models"
36
+ MAX_SEGMENTS = 8 # UI 片段数上限
37
+ DEFAULT_VIDEO_PATH="data/sample.mp4"
38
+ DEFAULT_AUDIO_PATH="data/ref.wav"
39
+ DEFAULT_TEXT = "我军无粮,利在急战。今乘魏兵新败,不敢出兵,出其不意,乘机退去,方可平安无事。"
40
+ DEFAULT_CLUE = "一位中年男性以沉稳但略带担忧的语调,分析我军无粮急战的困境与敌军心败状态。他随即提出一种撤退方案,整体流露出对战局的担忧和谋求生路。"
41
+ # 全局模型实例(延迟加载)
42
+ model_pool: typing.Optional[GlobalModels] = None
43
+ engine = None
44
+
45
+ def init_engine():
46
+ """延迟加载模型,避免启动时卡住"""
47
+ global engine
48
+ engine = AutoFrontend(PRETRAIN, CONFIG_MODEL, TEMP_DIR, DEVICE)
49
+ return engine
50
+
51
+ def init_frontend_models():
52
+ global model_pool
53
+ model_pool = GlobalModels(
54
+ hf_token = None,
55
+ config_path = CONFIG_FRONTEND,
56
+ pretrained_dir= PRETRAIN,
57
+ device = DEVICE,
58
+ pool_sizes = {"face": 1, "asd": 1, "fr": 1},
59
+ batch_size = 1,
60
+ preload = True
61
+ )
62
+ return model_pool
63
+
64
+ # ==================== Gradio UI 逻辑 ====================
65
+
66
+ def create_segments_ui():
67
+ segments = []
68
+ accordions = []
69
+ for i in range(MAX_SEGMENTS):
70
+ with gr.Accordion(f"🎬 配音片段 {i + 1}", open=(i == 0), visible=(i == 0)) as acc:
71
+ accordions.append(acc)
72
+ with gr.Row():
73
+ text_input = gr.Textbox(label="📝 配音文本内容", placeholder="输入台词...", lines=2, scale=3, elem_id=f"text_{i}")
74
+ clue_input = gr.Textbox(label="💡 线索描述", placeholder="一位中年男性角色语气沉稳且坚定,流露出对自身忠诚的强烈自信与决心。整体情感是忠贞不渝的承诺和不容置疑的信念。", lines=2, scale=3, elem_id=f"clue_{i}")
75
+ with gr.Row():
76
+ start_time = gr.Number(label="⏱️ 起始时间 (s)", value=0.0 + i*5, precision=2, scale=2, elem_id=f"start_{i}")
77
+ end_time = gr.Number(label="⏱️ 终止时间 (s)", value=5.0 + i*5, precision=2, scale=2, elem_id=f"end_{i}")
78
+ with gr.Row():
79
+ age_input = gr.Dropdown(label="👤 年龄", choices=["儿童", "青年", "中年", "中老年", "老年", "不确定"], value="不确定", scale=2, elem_id=f"age_{i}")
80
+ gender_input = gr.Dropdown(label="👤 性别", choices=["男", "女", "不确定"], value="不确定", scale=2, elem_id=f"gender_{i}")
81
+ with gr.Row():
82
+ ref_audio = gr.Audio(label="🎤 参考语音 (可选,默认以视频原声作为参考音频)", sources=["upload"], type="filepath", scale=4,elem_id=f"audio_{i}")
83
+ load_audio_btn = gr.Button("📂 加载示例音频", size="sm", variant="secondary", scale=1) if i == 0 else None
84
+ with gr.Row():
85
+ enable_check = gr.Checkbox(label="启用此片段", value=(i == 0), scale=1, elem_id=f"enable_{i}")
86
+
87
+ segments.append({
88
+ "accordion": acc, "text": text_input, "clue": clue_input, "start": start_time, "end": end_time,
89
+ "age": age_input, "gender": gender_input, "audio": ref_audio,
90
+ "enable": enable_check, "index": i, "load_audio_btn": load_audio_btn})
91
+ return segments, accordions
92
+
93
+ def add_segment_fn(current_count):
94
+ """点击加号:显示下一个片段,到达上限则禁用按钮"""
95
+ if current_count >= MAX_SEGMENTS:
96
+ return [current_count] + [gr.update() for _ in range(MAX_SEGMENTS)] + [gr.update(interactive=False, value=f"已达上限 ({MAX_SEGMENTS})")]
97
+
98
+ new_count = current_count + 1
99
+ vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
100
+ btn = gr.update(interactive=(new_count < MAX_SEGMENTS), value="➕新片段")
101
+ return [new_count] + vis + [btn]
102
+
103
+ def load_srt_fn(srt_file, current_count):
104
+ empty_fields = [gr.update() for _ in range(MAX_SEGMENTS * 4)]
105
+ empty_vis = [gr.update() for _ in range(MAX_SEGMENTS)]
106
+ if not srt_file:
107
+ return [current_count] + empty_fields + empty_vis + [gr.update()]
108
+ try:
109
+ with open(srt_file, 'r', encoding='utf-8-sig') as f:
110
+ content = f.read()
111
+ except Exception as e:
112
+ gr.Warning(f"读取 SRT 文件失败: {e}")
113
+ return [current_count] + empty_fields + empty_vis + [gr.update()]
114
+ parsed = parse_srt_content(content)
115
+ if not parsed:
116
+ print(" 未解析到有效字幕,请检查 SRT 格式")
117
+ return [current_count] + empty_fields + empty_vis + [gr.update()]
118
+ updates = []
119
+ for i in range(MAX_SEGMENTS):
120
+ if i < len(parsed):
121
+ seg = parsed[i]
122
+ updates.append(gr.update(value=seg['text']))
123
+ updates.append(gr.update(value=round(seg['start'], 2)))
124
+ updates.append(gr.update(value=round(seg['end'], 2)))
125
+ updates.append(gr.update(value=True))
126
+ else:
127
+ updates.append(gr.update(value=""))
128
+ updates.append(gr.update(value=0.0))
129
+ updates.append(gr.update(value=5.0 + i*5))
130
+ updates.append(gr.update(value=False))
131
+ new_count = min(len(parsed), MAX_SEGMENTS)
132
+ vis = [gr.update(visible=(i < new_count)) for i in range(MAX_SEGMENTS)]
133
+ btn = gr.update(interactive=(new_count < MAX_SEGMENTS))
134
+ if len(parsed) > MAX_SEGMENTS:
135
+ gr.Warning(f"SRT 包含 {len(parsed)} 个片段,已截取前 {MAX_SEGMENTS} 条")
136
+
137
+ return [new_count] + updates + vis + [btn]
138
+
139
+ def process_dubbing(video_file, *segment_inputs, progress=gr.Progress()):
140
+ """主推理流程"""
141
+ if not video_file:
142
+ return None, "❌ 请上传视频文件"
143
+
144
+ video_duration = get_video_duration(video_file)
145
+ if video_duration <= 0:
146
+ return None, "❌ 无法获取视频时长,请检查视频文件"
147
+
148
+ if os.path.exists(TEMP_DIR):
149
+ try:
150
+ shutil.rmtree(TEMP_DIR)
151
+ except Exception as e:
152
+ return None, f"❌ 清空临时目录失败:{e}"
153
+ os.makedirs(TEMP_DIR, exist_ok=True)
154
+
155
+ # 解析 segment_inputs
156
+ segments_data = []
157
+ for i in range(MAX_SEGMENTS):
158
+ base_idx = i * 8
159
+ enable = segment_inputs[base_idx + 7] # enable_check
160
+ if not enable: continue
161
+ text = segment_inputs[base_idx + 0]
162
+ if not text or not text.strip(): continue
163
+
164
+ clue = segment_inputs[base_idx + 1]
165
+ start = segment_inputs[base_idx + 2]
166
+ end = segment_inputs[base_idx + 3]
167
+ age = segment_inputs[base_idx + 4]
168
+ gender = segment_inputs[base_idx + 5]
169
+ ref_audio = segment_inputs[base_idx + 6]
170
+
171
+ errors = validate_timestamps(start, end, video_duration)
172
+ if errors:
173
+ return None, f"❌ 片段 {i+1} 时间戳错误:\n" + "\n".join(errors)
174
+
175
+ data = {
176
+ "text": str(text).strip(),
177
+ "clue": str(clue) if clue else "",
178
+ "start": float(start) if start else 0.0,
179
+ "end": float(end) if end else 0.0,
180
+ "age": str(age) if age else "不确定",
181
+ "gender": str(gender) if gender else "不确定",
182
+ "ref_audio": str(ref_audio) if ref_audio else ""
183
+ }
184
+
185
+ segments_data.append(data)
186
+
187
+ if not segments_data:
188
+ return None, "❌ 有效片段数据为空,请启用并填写至少一个片段"
189
+
190
+ try:
191
+ progress(0.1, desc="📋 预处理视频,生成 JSONL 数据...")
192
+ frontend = init_frontend_models()
193
+ jsonl_path, jsonl_items = generate_jsonl_data(frontend, video_file, segments_data, TEMP_DIR, video_duration)
194
+ report_lines = [f"✅ 任务完成!共生成 **{len(jsonl_items)}** 个片段数据。\n", "详细 JSONL 数据预览:**", "=" * 40]
195
+ for idx, item in enumerate(jsonl_items):
196
+ report_lines.extend([f"\n---片段 #{idx + 1} ---", json.dumps(item, ensure_ascii=False, indent=2), "-" * 40])
197
+ full_report = "\n".join(report_lines)
198
+
199
+ progress(0.3, desc="🔄 FunCineForge 模型加载中...")
200
+
201
+ eng = init_engine()
202
+ if eng and jsonl_items:
203
+ try:
204
+ progress(0.5, desc="🚀 FunCineForge 模型推理中...")
205
+ eng.inference(jsonl_path)
206
+
207
+ progress(0.8, desc="🎵 正在将配音语音粘贴回静音视频...")
208
+
209
+ output_wav_dir = os.path.join(TEMP_DIR, "wav")
210
+ final_video_path = os.path.join(TEMP_DIR, "dubbed_video.mp4")
211
+
212
+ if not os.path.exists(output_wav_dir):
213
+ return None, f"⚠️ 未找到音频输出目录:{output_wav_dir}"
214
+
215
+ wav_files = sorted([f for f in os.listdir(output_wav_dir) if f.endswith('.wav')])
216
+ if not wav_files:
217
+ return None, f"⚠️ 未生成任何音频文件:{output_wav_dir}"
218
+
219
+ time_mapping = {}
220
+ for item in jsonl_items:
221
+ for wf in wav_files:
222
+ if wf.startswith(item['utt']):
223
+ time_mapping[wf] = float(item['start'])
224
+ break
225
+
226
+ original_clip = VideoFileClip(video_file)
227
+ video_duration = original_clip.duration
228
+ is_silent = original_clip.audio is None
229
+ video_only = original_clip if is_silent else original_clip.without_audio()
230
+ audio_clips = []
231
+ for wav_file, start_time in time_mapping.items():
232
+ wav_path = os.path.join(output_wav_dir, wav_file)
233
+ audio_clip = AudioFileClip(wav_path).with_start(start_time)
234
+ audio_clips.append(audio_clip)
235
+
236
+ final_audio = CompositeAudioClip(audio_clips)
237
+ if final_audio.duration < video_duration:
238
+ final_audio = final_audio.with_duration(video_duration)
239
+ final_clip = video_only.with_audio(final_audio)
240
+ final_clip.write_videofile(
241
+ final_video_path,
242
+ codec='libx264',
243
+ audio_codec='aac',
244
+ preset='veryfast',
245
+ threads=8,
246
+ fps=original_clip.fps,
247
+ logger=None
248
+ )
249
+ original_clip.close(); video_only.close()
250
+ for ac in audio_clips: ac.close()
251
+ if 'final_audio' in locals(): final_audio.close()
252
+ final_clip.close()
253
+
254
+ progress(1.0, desc="✅ 配音完成")
255
+ return final_video_path, full_report
256
+ except Exception as e:
257
+ import traceback; traceback.print_exc()
258
+ if "index out of range" in str(e):
259
+ return None, f"⚠️ 模型推理失败。错误:{str(e)},建议补齐输入的线索描述和说话人属性"
260
+ else:
261
+ return None, f"⚠️ 模型推理失败。错误:{str(e)}"
262
+ else:
263
+ time.sleep(1)
264
+ progress(1.0, desc="模拟完成")
265
+ return video_file, full_report
266
+
267
+ except Exception as e:
268
+ import traceback; traceback.print_exc()
269
+ return None, f"❌ 发生错误:{str(e)}"
270
+
271
+
272
+ # ==================== 主程序 ====================
273
+
274
+ def main():
275
+ os.makedirs(TEMP_DIR, exist_ok=True)
276
+ with gr.Blocks(
277
+ title="Fun-CineForge 影视配音平台",
278
+ theme=gr.themes.Soft(),
279
+ css="""
280
+ .segment-accordion { margin: 10px 0; }
281
+ .gr-button-primary { background: #1976d2; }
282
+ .gr-button-stop { background: #d32f2f; }
283
+ """
284
+ ) as demo:
285
+
286
+ gr.Markdown("""
287
+ # 🎬 Fun-CineForge
288
+
289
+ **工作流程:** 上传短视频 → 配音片段信息(或上传 .srt 字幕文件) → 上传参考音色(可选) → 预处理、模型加载和推理 → 输出配音视频
290
+ """)
291
+
292
+ with gr.Row():
293
+ with gr.Column(scale=1):
294
+ video_input = gr.Video(label="上传视频", sources=["upload"])
295
+ load_video_btn = gr.Button("📂 加载示例视频", variant="secondary", size="sm")
296
+ srt_input = gr.UploadButton("上传 SRT 字幕", file_types=[".srt"], size="sm", variant="secondary")
297
+ # with gr.Row(elem_classes=["srt-compact"]):
298
+ # srt_input = gr.File(label="上传 SRT 字幕", file_types=[".srt"], height="auto")
299
+ gr.Markdown("### 🎛️ 配音片段配置")
300
+
301
+ segments, accordions = create_segments_ui()
302
+ seg_count_state = gr.State(1) #🔑记录当前可见片段数
303
+ add_segment_btn = gr.Button("➕添加新片段", size="sm", variant="secondary")
304
+ submit_btn = gr.Button("🚀 开始生成配音", variant="stop", size="lg")
305
+
306
+ with gr.Column(scale=1):
307
+ video_output = gr.Video(label="📺 配音后视频", autoplay=True)
308
+
309
+ status_text = gr.Textbox(label="结果状态", interactive=False, lines=2)
310
+
311
+ gr.Markdown("""
312
+ ### 📝 使用说明
313
+ | 字段 | 说明 |
314
+ |------|------|
315
+ | 配音文本 | 该片段台词内容(支持中/英) |
316
+ | 线索描述 | 请参考样例格式,阐述配音要求,重点描述说话人的性别年龄、语气和情感 |
317
+ | 时间戳 | 起止时间戳 (可精确到毫秒),模型对时间戳敏感,建议紧邻有声区间。时长 ≤30s/片段 |
318
+ | 年龄/性别 | 说话人属性选项 |
319
+ | 参考语音 | 音色克隆参考 (可选) |
320
+
321
+ **⚠️ 注意:** 确保每个片段的时间戳不重叠,且时间戳不超过视频总时长。模型会根据片段的时��长度进行强制时间对齐,弱监督对齐唇部运动。
322
+ """)
323
+
324
+ # ==================== 事件绑定 ====================
325
+
326
+ # 收集所有片段组件作为输入
327
+ segment_inputs = []
328
+ for seg in segments:
329
+ segment_inputs.extend([
330
+ seg["text"],
331
+ seg["clue"],
332
+ seg["start"],
333
+ seg["end"],
334
+ seg["age"],
335
+ seg["gender"],
336
+ seg["audio"],
337
+ seg["enable"]
338
+ ])
339
+
340
+ srt_update_fields = []
341
+ for seg in segments:
342
+ srt_update_fields.extend([seg["text"], seg["start"], seg["end"], seg["enable"]])
343
+
344
+ # 动态添加片段
345
+ add_segment_btn.click(
346
+ fn=add_segment_fn,
347
+ inputs=[seg_count_state],
348
+ outputs=[seg_count_state] + accordions + [add_segment_btn]
349
+ )
350
+
351
+ # SRT 加载
352
+ srt_input.upload(
353
+ fn=load_srt_fn,
354
+ inputs=[srt_input, seg_count_state],
355
+ outputs=[seg_count_state] + srt_update_fields + accordions + [add_segment_btn]
356
+ )
357
+
358
+ # 主推理
359
+ submit_btn.click(
360
+ fn=process_dubbing,
361
+ inputs=[video_input] + segment_inputs,
362
+ outputs=[video_output, status_text]
363
+ )
364
+
365
+ # 视频上传联动时间戳
366
+ def update_timestamps(video):
367
+ if not video: return [gr.update() for _ in range(MAX_SEGMENTS * 2)]
368
+ dur = get_video_duration(video)
369
+ updates = []
370
+ for i in range(MAX_SEGMENTS):
371
+ updates.append(gr.update(value=0.0))
372
+ updates.append(gr.update(value=dur))
373
+ return updates
374
+
375
+ def load_default_video_fn():
376
+ return DEFAULT_VIDEO_PATH, DEFAULT_TEXT, DEFAULT_CLUE
377
+
378
+ def load_default_audio_fn():
379
+ return DEFAULT_AUDIO_PATH
380
+
381
+ load_video_btn.click(
382
+ fn=load_default_video_fn,
383
+ inputs=[],
384
+ outputs=[video_input, segments[0]["text"], segments[0]["clue"]]
385
+ ).then(
386
+ fn=update_timestamps,
387
+ inputs=[video_input],
388
+ outputs=[segment_inputs[i] for i in range(len(segment_inputs)) if i % 8 in [2, 3]]
389
+ )
390
+
391
+ video_input.change(
392
+ fn=update_timestamps,
393
+ inputs=[video_input],
394
+ outputs=[comp for pair in zip(segment_inputs[2::8], segment_inputs[3::8]) for comp in pair]
395
+ )
396
+
397
+ if segments and segments[0]["load_audio_btn"]:
398
+ segments[0]["load_audio_btn"].click(
399
+ fn=load_default_audio_fn,
400
+ inputs=[],
401
+ outputs=[segments[0]["audio"]]
402
+ )
403
+
404
+ # ==================== 启动服务 ====================
405
+
406
+ demo.launch(
407
+ server_name="0.0.0.0",
408
+ server_port=SERVER_PORT,
409
+ share=False,
410
+ show_error=True,
411
+ inbrowser=True,
412
+ )
413
+
414
+ if __name__ == "__main__":
415
+ main()
data/ref.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8420568976edb1cf17a63d9fa968aedaf3c0f68cca4dbf75a409876b96ad700b
3
+ size 788876
data/sample.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b901981a2213fc7f98cd6424869710e8396eb558ff2ff3e8ab5d52fe427e0ab6
3
+ size 2567737
decode_conf/decode.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: FunCineForgeInferModel
2
+ index_ds: FunCineForgeDS
3
+ xvec_model: pretrained_models/funcineforge_zh_en/camplus.onnx
4
+ model_conf: {}
5
+
6
+ dataset_conf:
7
+ # face is from the video, vocal is the reference audio, extract speaker ID and start-end timestamp from dialogue
8
+ load_meta_data_key: "text,clue,face,dialogue,vocal,video"
9
+ sos: 6561
10
+ eos: 6562
11
+ turn_of_speech: 6563
12
+ fill_token: 6564
13
+ ignore_id: -100
14
+ startofclue_token: 151646
15
+ endofclue_token: 151647
16
+ frame_shift: 25 # ms
17
+ timebook_size: 1500 # 60 * 25 = 1500
18
+ pangbai: 1500
19
+ dubai: 1501
20
+ duihua: 1502
21
+ duoren: 1503
22
+ male: 1504
23
+ female: 1505
24
+ child: 1506
25
+ youth: 1507
26
+ adult: 1508
27
+ middle: 1509
28
+ elderly: 1510
29
+ speaker_id_start: 1511
30
+
31
+
32
+ sampling: ras
33
+ lm_use_prompt: true
34
+ fm_use_prompt: true
35
+ use_llm_cache: true
36
+ seed: 0
37
+ max_length: 1500 # 60s * 25 fps
38
+ min_length: 50 # 2s * 25 fps
39
+ llm_dtype: fp32
40
+ fm_dtype: fp32
41
+ voc_dtype: fp32
42
+ batch_size: 1
decode_conf/diar.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Diarization config
2
+
3
+ fbank_dim: 80
4
+ embedding_size: 192
5
+
6
+ feature_extractor:
7
+ obj: speakerlab.process.processor.FBank
8
+ args:
9
+ n_mels: <fbank_dim>
10
+ sample_rate: <sample_rate>
11
+ mean_nor: True
12
+
13
+ embedding_model:
14
+ obj: speakerlab.models.campplus.DTDNN.CAMPPlus
15
+ args:
16
+ feat_dim: <fbank_dim>
17
+ embedding_size: <embedding_size>
18
+
19
+ # for visual embeddings extraction
20
+ min_track: 10
21
+ num_failed_det: 10
22
+ crop_scale: 0.4
23
+ min_face_size: 1
24
+ face_det_stride: 5 # 每5帧检测一次人脸
25
+ shot_stride: 50
26
+
27
+ # for clustering
28
+ audio_cluster:
29
+ obj: speakerlab.process.cluster.CommonClustering
30
+ args:
31
+ cluster_type: spectral
32
+ min_num_spks: 1
33
+ max_num_spks: 15
34
+ min_cluster_size: 1
35
+ oracle_num: null
36
+ pval: 0.032
37
+ mer_cos: 0.8
38
+
39
+ vision_cluster:
40
+ obj: speakerlab.process.cluster.CommonClustering
41
+ args:
42
+ cluster_type: AHC
43
+ cluster_line: 2
44
+ min_cluster_size: 1
45
+ fix_cos_thr: 0.25
46
+
47
+ cluster:
48
+ obj: speakerlab.process.cluster.JointClustering
49
+ args:
50
+ audio_cluster: <audio_cluster>
51
+ vision_cluster: <vision_cluster>
decode_conf/ds_stage0_fp32.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_micro_batch_size_per_gpu": 1,
3
+ "gradient_accumulation_steps": 1,
4
+ "steps_per_print": 100,
5
+ "gradient_clipping": 5,
6
+ "fp16": {
7
+ "enabled": false,
8
+ "auto_cast": false,
9
+ "loss_scale": 0,
10
+ "initial_scale_power": 16,
11
+ "loss_scale_window": 1000,
12
+ "hysteresis": 2,
13
+ "consecutive_hysteresis": false,
14
+ "min_loss_scale": 1
15
+ },
16
+ "bf16": {
17
+ "enabled": false
18
+ },
19
+ "zero_force_ds_cpu_optimizer": false,
20
+ "zero_optimization": {
21
+ "stage": 0,
22
+ "offload_optimizer": {
23
+ "device": "none",
24
+ "pin_memory": true
25
+ },
26
+ "allgather_partitions": true,
27
+ "allgather_bucket_size": 5e8,
28
+ "overlap_comm": true,
29
+ "reduce_scatter": true,
30
+ "reduce_bucket_size": 5e8,
31
+ "contiguous_gradients" : true
32
+ }
33
+ }
funcineforge/.DS_Store ADDED
Binary file (8.2 kB). View file
 
funcineforge/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Initialize package."""
2
+
3
+ import os
4
+ from funcineforge.auto.auto_model import AutoModel
5
+ from funcineforge.auto.auto_frontend import AutoFrontend
6
+
7
+ os.environ["HYDRA_FULL_ERROR"] = "1"
funcineforge/auto/__init__.py ADDED
File without changes
funcineforge/auto/auto_frontend.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import logging
4
+ from omegaconf import OmegaConf
5
+ from funcineforge.utils.hinter import get_logger
6
+ from funcineforge.models.utils import dtype_map
7
+ from funcineforge.datasets import FunCineForgeDS
8
+
9
+ class AutoFrontend:
10
+ def __init__(
11
+ self,
12
+ ckpt_path: str,
13
+ config_path: str,
14
+ output_dir: str,
15
+ device: str = "cuda:0"
16
+ ):
17
+ self.logger = get_logger(log_level=logging.INFO, local_rank=1, world_size=1)
18
+ self.device = device
19
+ self.output_dir = output_dir
20
+ self.lm_model = None
21
+ self.fm_model = None
22
+ self.voc_model = None
23
+ self.model = None
24
+ self.index_ds_class = None
25
+
26
+ self.dataset_conf = None
27
+ self.kwargs = OmegaConf.load(config_path)
28
+
29
+ if device.startswith("cuda"):
30
+ try:
31
+ device_id = int(device.split(":")[-1])
32
+ torch.cuda.set_device(device_id)
33
+ except (ValueError, IndexError):
34
+ self.logger.warning(f"Invalid cuda device string {device}, defaulting to 0")
35
+ torch.cuda.set_device(0)
36
+ else:
37
+ self.logger.info(f"Running on CPU")
38
+
39
+
40
+ lm_ckpt_path = os.path.join(ckpt_path, "funcineforge_zh_en/llm/ds-model.pt.best/mp_rank_00_model_states.pt")
41
+ fm_ckpt_path = os.path.join(ckpt_path, "funcineforge_zh_en/flow/ds-model.pt.best/mp_rank_00_model_states.pt")
42
+ voc_ckpt_path = os.path.join(ckpt_path, "funcineforge_zh_en/vocoder/ds-model.pt.best/avg_5_removewn.pt")
43
+
44
+ lm_exp_dir, lm_model_name, lm_ckpt_id, _ = lm_ckpt_path.rsplit("/", 3)
45
+ self.logger.info(f"init LM model form {lm_ckpt_path}")
46
+
47
+ from funcineforge import AutoModel
48
+ self.lm_model = (AutoModel(
49
+ model=os.path.join(lm_exp_dir, lm_model_name),
50
+ init_param=lm_ckpt_path,
51
+ output_dir=None,
52
+ device=device,
53
+ ))
54
+ self.lm_model.model.to(dtype_map[self.kwargs.get("llm_dtype", "fp32")])
55
+
56
+ fm_exp_dir, fm_model_name, fm_ckpt_id, _ = fm_ckpt_path.rsplit("/", 3)
57
+ self.logger.info(f"build FM model form {fm_ckpt_path}")
58
+ self.fm_model = AutoModel(
59
+ model=os.path.join(fm_exp_dir, fm_model_name),
60
+ init_param=fm_ckpt_path,
61
+ output_dir=None,
62
+ device=device,
63
+ )
64
+ self.fm_model.model.to(dtype_map[self.kwargs.get("fm_dtype", "fp32")])
65
+
66
+ voc_exp_dir, voc_model_name, voc_ckpt_id, _ = voc_ckpt_path.rsplit("/", 3)
67
+ self.logger.info(f"build VOC model form {voc_ckpt_path}")
68
+ self.voc_model = AutoModel(
69
+ model=os.path.join(voc_exp_dir, voc_model_name),
70
+ init_param=voc_ckpt_path,
71
+ output_dir=None,
72
+ device=device,
73
+ )
74
+ self.voc_model.model.to(dtype_map[self.kwargs.get("voc_dtype", "fp32")])
75
+
76
+ self.logger.info(f"build inference model {self.kwargs.get('model')}")
77
+ self.kwargs["output_dir"] = output_dir
78
+ self.kwargs["tokenizer"] = None
79
+ self.model = AutoModel(
80
+ **self.kwargs,
81
+ lm_model=self.lm_model,
82
+ fm_model=self.fm_model,
83
+ voc_model=self.voc_model,
84
+ )
85
+ self.dataset_conf = self.kwargs.get("dataset_conf")
86
+
87
+ def inference(self, jsonl_path: str):
88
+ if not self.model:
89
+ raise RuntimeError("Model class not initialized.")
90
+
91
+ dataset = FunCineForgeDS(jsonl_path, **self.dataset_conf)
92
+ self.logger.info(f"Starting inference on {len(dataset)} items...")
93
+
94
+ self.model.inference(input=dataset, input_len=len(dataset))
95
+ self.logger.info("Inference finished.")
funcineforge/auto/auto_model.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import time
5
+ import torch
6
+ import logging
7
+ import os
8
+ from tqdm import tqdm
9
+ from funcineforge.utils.misc import deep_update
10
+ from funcineforge.utils.set_all_random_seed import set_all_random_seed
11
+ from funcineforge.utils.load_pretrained_model import load_pretrained_model
12
+ from funcineforge.download.download_model_from_hub import download_model
13
+ from funcineforge.tokenizer import FunCineForgeTokenizer
14
+ from funcineforge.face import FaceRecIR101
15
+ import importlib
16
+
17
+
18
+ def prepare_data_iterator(data_in, input_len):
19
+ """ """
20
+ data_list = []
21
+ key_list = []
22
+ for idx in range(input_len):
23
+ item = data_in[idx]
24
+ utt = item["utt"]
25
+ data_list.append(item)
26
+ key_list.append(utt)
27
+ return key_list, data_list
28
+
29
+
30
+ class AutoModel:
31
+
32
+ def __init__(self, **kwargs):
33
+ log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
34
+ logging.basicConfig(level=log_level)
35
+ model, kwargs = self.build_model(**kwargs)
36
+ self.kwargs = kwargs
37
+ self.model = model
38
+ self.model_path = kwargs.get("model_path")
39
+
40
+ @staticmethod
41
+ def build_model(**kwargs):
42
+ assert "model" in kwargs
43
+ if "model_conf" not in kwargs:
44
+ logging.info("download models from {} or local dir".format(kwargs.get("hub", "ms")))
45
+ kwargs = download_model(**kwargs)
46
+
47
+ set_all_random_seed(kwargs.get("seed", 0))
48
+
49
+ device = kwargs.get("device", "cuda")
50
+ if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
51
+ device = "cpu"
52
+ kwargs["batch_size"] = 1
53
+ kwargs["device"] = device
54
+
55
+ torch.set_num_threads(kwargs.get("ncpu", 4))
56
+
57
+ # build tokenizer
58
+ tokenizer = kwargs.get("tokenizer", None)
59
+ if tokenizer is not None:
60
+ tokenizer = FunCineForgeTokenizer(**kwargs.get("tokenizer_conf", {}))
61
+ kwargs["token_list"] = (
62
+ tokenizer.token_list if hasattr(tokenizer, "token_list") else None
63
+ )
64
+ kwargs["token_list"] = (
65
+ tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else kwargs["token_list"]
66
+ )
67
+ vocab_size = len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1
68
+ if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"):
69
+ vocab_size = tokenizer.get_vocab_size()
70
+ else:
71
+ vocab_size = -1
72
+ kwargs["tokenizer"] = tokenizer
73
+
74
+ # build face_encoder
75
+ face_encoder = kwargs.get("face_encoder", None)
76
+ if face_encoder is not None:
77
+ face_encoder = FaceRecIR101(**kwargs.get("face_encoder_conf", {}))
78
+ kwargs["face_encoder"] = face_encoder
79
+
80
+ model_conf = {}
81
+ model_class_name = kwargs["model"]
82
+ deep_update(model_conf, kwargs.get("model_conf", {}))
83
+ deep_update(model_conf, kwargs)
84
+ module = importlib.import_module("funcineforge.models")
85
+ model_class = getattr(module, model_class_name)
86
+ model = model_class(**model_conf, vocab_size=vocab_size)
87
+
88
+ # init_param
89
+ init_param = kwargs.get("init_param", None)
90
+ if init_param is not None and os.path.exists(init_param):
91
+ logging.info(f"Loading pretrained params from ckpt: {init_param}")
92
+ load_pretrained_model(
93
+ path=init_param,
94
+ model=model,
95
+ ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
96
+ scope_map=kwargs.get("scope_map", []),
97
+ excludes=kwargs.get("excludes", None),
98
+ use_deepspeed=kwargs.get("train_conf", {}).get("use_deepspeed", False),
99
+ save_deepspeed_zero_fp32=kwargs.get("save_deepspeed_zero_fp32", True),
100
+ )
101
+
102
+ # fp16
103
+ if kwargs.get("fp16", False):
104
+ model.to(torch.float16)
105
+ elif kwargs.get("bf16", False):
106
+ model.to(torch.bfloat16)
107
+ model.to(device)
108
+
109
+ return model, kwargs
110
+
111
+ def __call__(self, *args, **cfg):
112
+ kwargs = self.kwargs
113
+ deep_update(kwargs, cfg)
114
+ res = self.model(*args, kwargs)
115
+ return res
116
+
117
+
118
+ def inference(self, input, input_len=None, model=None, kwargs=None, **cfg):
119
+ kwargs = self.kwargs if kwargs is None else kwargs
120
+ deep_update(kwargs, cfg)
121
+ model = self.model if model is None else model
122
+ model.eval()
123
+ batch_size = kwargs.get("batch_size", 1)
124
+ key_list, data_list = prepare_data_iterator(
125
+ input, input_len=input_len
126
+ )
127
+
128
+ speed_stats = {}
129
+ num_samples = len(data_list)
130
+ disable_pbar = self.kwargs.get("disable_pbar", False)
131
+ pbar = (
132
+ tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None
133
+ )
134
+ time_speech_total = 0.0
135
+ time_escape_total = 0.0
136
+ count = 0
137
+ log_interval = kwargs.get("log_interval", None)
138
+ for beg_idx in range(0, num_samples, batch_size):
139
+ end_idx = min(num_samples, beg_idx + batch_size)
140
+ data_batch = data_list[beg_idx:end_idx]
141
+ key_batch = key_list[beg_idx:end_idx]
142
+ batch = {"data_in": data_batch, "data_lengths": end_idx - beg_idx, "key": key_batch}
143
+
144
+ time1 = time.perf_counter()
145
+ with torch.no_grad():
146
+ res = model.inference(**batch, **kwargs)
147
+ if isinstance(res, (list, tuple)):
148
+ results = res[0] if len(res) > 0 else [{"text": ""}]
149
+ meta_data = res[1] if len(res) > 1 else {}
150
+ time2 = time.perf_counter()
151
+
152
+ batch_data_time = meta_data.get("batch_data_time", -1)
153
+ time_escape = time2 - time1
154
+ speed_stats["forward"] = f"{time_escape:0.3f}"
155
+ speed_stats["batch_size"] = f"{len(results)}"
156
+ speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}"
157
+ description = f"{speed_stats}, "
158
+ if pbar:
159
+ pbar.update(batch_size)
160
+ pbar.set_description(description)
161
+ else:
162
+ if log_interval is not None and count % log_interval == 0:
163
+ logging.info(
164
+ f"processed {count*batch_size}/{num_samples} samples: {key_batch[0]}"
165
+ )
166
+ time_speech_total += batch_data_time
167
+ time_escape_total += time_escape
168
+ count += 1
169
+
170
+ if pbar:
171
+ pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}")
172
+ torch.cuda.empty_cache()
173
+ return
funcineforge/datasets/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .index_ds import FunCineForgeDS
2
+ from .datasets import FunCineForgeDataset
funcineforge/datasets/datasets.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import torch
3
+ import pickle
4
+ import numpy as np
5
+ from funcineforge.utils.hinter import hint_once
6
+ from funcineforge.datasets import FunCineForgeDS
7
+ from funcineforge.models import FunCineForgeSpecAug
8
+
9
class FunCineForgeDataset(torch.utils.data.Dataset):
    """
    Dataset for Mixed LM of FunCineForge.

    Wraps a ``FunCineForgeDS`` index dataset and turns each manifest entry into
    model-ready tensors: tokenized text (optionally prefixed with an emotion
    clue), speech codec tokens (optionally spec-augmented), dialogue
    time/speaker ids, per-frame face embeddings, plus the label/flag tensors
    the mixed language model trains on.
    """

    def __init__(
        self,
        path,
        index_ds: str = None,  # unused here; kept for config compatibility
        frontend=None,  # unused here; kept for config compatibility
        tokenizer=None,
        face_encoder=None,
        int_pad_value: int = -1,
        float_pad_value: float = 0.0,
        **kwargs,
    ):
        super().__init__()
        self.index_ds = FunCineForgeDS(path, **kwargs)
        self.tokenizer = tokenizer
        self.face_encoder = face_encoder

        # Padding values used by ``collator`` (int vs float tensors are padded differently).
        self.int_pad_value = int_pad_value
        self.float_pad_value = float_pad_value
        self.batch_size = kwargs.get("batch_size")
        self.batch_type = kwargs.get("batch_type")
        # Max number of retries when fetching/collating a sample.
        self.retry = kwargs.get("retry", 100)

        # self.kwargs = kwargs
        self.max_token_length = kwargs.get("max_token_length", 1500)
        self.batch_size_scale_ratio_max = kwargs.get("batch_size_scale_ratio_max", 1.5)
        self.batch_size_token_max = kwargs.get("batch_size_token_max", 2500)
        self.multiturn_num_max = kwargs.get("multiturn_num_max", 1)
        # Dimensionality of one face embedding vector.
        self.face_size = kwargs.get("face_size", 512)

        # Special LM token ids laid out right after the speech codec vocabulary.
        self.codebook_size = kwargs.get("codebook_size", 6561)
        self.sos = kwargs.get("sos", self.codebook_size)
        self.eos = kwargs.get("eos", self.codebook_size + 1)
        self.turn_of_speech = kwargs.get("turn_of_speech", self.codebook_size + 2)
        self.ignore_id = kwargs.get("ignore_id", -100)

        # Any non-None "specaug" value enables FunCineForgeSpecAug built from specaug_conf.
        specaug = kwargs.get("specaug", None)
        specaug_conf = kwargs.get("specaug_conf", {})
        if specaug is not None:
            specaug = FunCineForgeSpecAug(**specaug_conf)
        self.specaug = specaug

        self.set_invalid_xvec_zeros = kwargs.get("set_invalid_xvec_zeros", False)
        self.use_emotion_clue = kwargs.get("use_emotion_clue", False)
        logging.info(f"use_emotion_clue: {self.use_emotion_clue}")

    def get_source_len(self, index):
        # Length hook used by length-aware batch samplers.
        item = self.index_ds[index]
        source_len = self.index_ds.get_source_len(item)
        return source_len

    def get_target_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_target_len(item)

    def __len__(self):
        return len(self.index_ds)

    def mixup_text_codec(self, text: torch.Tensor, aug_codec: torch.Tensor, timespk_ids: torch.Tensor, type_id: int):
        """Interleave text / dialogue / codec tokens into one LM sequence.

        Layout: [sos, text..., type_id, timespk_ids..., turn_of_speech, codec..., eos].

        Returns:
            (input_ids, labels, text_flag, codec_flag, timespk_flag) where
            ``labels`` masks everything up to and including ``turn_of_speech``
            with ``ignore_id`` so the loss covers only codec tokens and eos;
            the three float flags are mutually exclusive 0/1 masks over the
            text span, the type_id+timespk span, and the remainder.
        """
        text_len = text.shape[0]
        timespk_len = timespk_ids.shape[0]
        sequence = [self.sos, *text.tolist(), type_id, *timespk_ids.tolist(), self.turn_of_speech, *aug_codec.tolist(), self.eos]
        # sequence = [self.sos, *text.tolist(), type_id, self.turn_of_speech, *aug_codec.tolist(), self.eos]
        input_ids = torch.tensor(sequence, dtype=torch.int64)
        text_flag = torch.zeros(len(sequence), dtype=torch.float32)
        text_flag[1:text_len+1] = 1
        # timespk_flag covers type_id plus the dialogue time/speaker ids.
        timespk_flag = torch.zeros(len(sequence), dtype=torch.float32)
        timespk_flag[text_len+1:text_len+2+timespk_len] = 1
        # timespk_flag[text_len+1:text_len+2] = 1
        codec_flag = 1 - (text_flag + timespk_flag)
        labels = torch.tensor(sequence, dtype=torch.int64)
        # +3 accounts for sos, type_id and turn_of_speech.
        labels[:text_len+timespk_len+3] = self.ignore_id
        # labels[:text_len+3] = self.ignore_id

        return input_ids, labels, text_flag, codec_flag, timespk_flag

    def __getitem__(self, index):
        """Build one training example; on retry, re-draw a random index.

        NOTE(review): the loop body has no exception handling and always
        reaches ``break`` on the first iteration, so the retry path is
        currently unreachable — confirm whether a try/except was intended.
        """
        output = None
        for idx in range(self.retry):
            if idx == 0:
                index_cur = index
            else:
                index_cur = torch.randint(0, len(self.index_ds), ()).item()
            item = self.index_ds[index_cur]

            # clue + text
            text = item["text"]
            clue = "<|startofclue|>" + item["clue"] + "<|endofclue|>"
            if self.use_emotion_clue:
                text = clue + text
            text_ids = torch.tensor(self.tokenizer.encode(text), dtype=torch.int32)
            hint_once(f"raw text: {text}", "log_text")

            # speech tokens
            target_out = item["token"]
            codec = torch.from_numpy(np.load(target_out))
            codec_len = codec.shape[0]  # could instead use the dataset's precomputed speech_length
            aug_codec = codec.clone()
            if self.specaug is not None:  # aug_codec is a randomly-masked copy of the codec, for robustness
                aug_codec, _ = self.specaug(aug_codec.float().unsqueeze(0).unsqueeze(-1))
                aug_codec = aug_codec.squeeze(0).squeeze(-1).long()

            # dialogue
            timespk_ids = torch.from_numpy(item["timespk_ids"])

            # mixup
            type_id = item["type_id"]
            input_ids, labels, text_flag, codec_flag, timespk_flag = self.mixup_text_codec(
                text_ids, aug_codec, timespk_ids, type_id
            )

            # face: one embedding row per codec frame; each detected face
            # embedding is propagated over at most 5 consecutive frames.
            face_features = item["face"]
            face_emb = torch.zeros((codec_len, self.face_size), dtype=torch.float32)  # face_emb length equals codec_len
            with open(face_features, 'rb') as f:
                stat_obj = pickle.load(f)
            embeddings = stat_obj['embeddings']
            faceI = stat_obj['faceI']
            for emb, frameI in zip(embeddings, faceI):
                fi = int(frameI)
                if 0 <= fi < codec_len:
                    end = min(fi + 5, codec_len)
                    face_emb[fi:end] = torch.from_numpy(emb).expand(end - fi, -1)

            # attention_mask spans the full sequence: input_id = (sos, <|startofclue|>, clue, <|endofclue|>, text, type_id, timespk_ids, turn_of_speech, speech, eos)
            attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
            codec_len = torch.tensor([codec_len], dtype=torch.int32)
            output = {
                "input_ids": input_ids,
                "face_emb": face_emb,
                "attention_mask": attention_mask,
                "labels_ids": labels,
                "text_flag": text_flag,
                "codec_flag": codec_flag,
                "timespk_flag": timespk_flag,
                "codec_len": codec_len,
            }
            break
        return output

    def collator(self, samples: list = None):
        """Pad a list of ``__getitem__`` outputs into batch tensors.

        Int32/int64 tensors are padded with ``int_pad_value``, everything else
        with ``float_pad_value``. For token-based batching, the last sample is
        repeatedly dropped (and padding redone) while ``batch * max_len``
        exceeds ``batch_size_token_max``.
        """

        for idx in range(self.retry):
            badcase_flag = False  # NOTE(review): currently unused

            outputs = {}
            for sample in samples:
                if sample is None:
                    continue
                for key in sample.keys():
                    if key not in outputs:
                        outputs[key] = []
                    if isinstance(sample[key], (list, tuple)):
                        outputs[key].extend(sample[key])
                    else:
                        outputs[key].append(sample[key])

            for key, data_list in outputs.items():
                if isinstance(data_list[0], torch.Tensor):
                    if data_list[0].dtype == torch.int64 or data_list[0].dtype == torch.int32:

                        pad_value = self.int_pad_value
                    else:
                        pad_value = self.float_pad_value

                    outputs[key] = torch.nn.utils.rnn.pad_sequence(
                        data_list, batch_first=True, padding_value=pad_value
                    )

            if self.batch_type != "example":
                b, t = outputs["input_ids"].shape
                if b > 1 and b * t > self.batch_size_token_max:
                    logging.info(
                        f"Warning, {idx}th, b*t: {b}*{t}={b * t} > batch_size_token_max: {self.batch_size_token_max}, drop last data"
                    )
                    samples = samples[:-1]
                    continue

            break

        return outputs
funcineforge/datasets/index_ds.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ import logging
4
+ import numpy as np
5
+
6
+
7
class FunCineForgeDS(torch.utils.data.Dataset):
    """Index dataset over FunCineForge jsonl manifests.

    Each jsonl line describes one utterance with at least:
      - ``utt``: utterance id
      - ``type``: scene type ("旁白" narration / "独白" monologue /
        "对话" dialogue / "多人" multi-speaker)
      - ``messages``: list of ``{"role": ..., "content": ...}``; every role in
        the comma-separated, required ``load_meta_data_key`` kwarg must appear
      - ``speech_length`` / ``text_length`` / ``clue_length``: precomputed
        lengths used for filtering and batch-length estimation

    Special ids (type / gender / age / speaker) are laid out directly after
    the ``timebook_size`` timestamp vocabulary.
    """

    def __init__(self, data_jsonl: str, **kwargs):
        super().__init__()

        self.max_source_length = kwargs.get("max_source_length", None)
        self.max_text_length = kwargs.get("max_text_length", None)
        self.max_token_length = kwargs.get("max_token_length", None)
        self.ignore_id = kwargs.get("ignore_id", -100)
        # Timestamp frames per second of audio — assumed token frame rate; TODO confirm.
        self.frame_shift = kwargs.get("frame_shift", 25)
        self.timebook_size = kwargs.get("timebook_size", 1500)
        # Scene-type ids: narration / monologue / dialogue / multi-speaker.
        self.type_map = {"旁白": kwargs.get("pangbai", self.timebook_size),
                         "独白": kwargs.get("dubai", self.timebook_size + 1),
                         "对话": kwargs.get("duihua", self.timebook_size + 2),
                         "多人": kwargs.get("duoren", self.timebook_size + 3),}
        self.gender_map = {"男": kwargs.get("male", self.timebook_size + 4),
                           "male": kwargs.get("male", self.timebook_size + 4),
                           "女": kwargs.get("female", self.timebook_size + 5),
                           "female": kwargs.get("female", self.timebook_size + 5),}
        self.age_map = {"儿童": kwargs.get("child", self.timebook_size + 6),
                        "child": kwargs.get("child", self.timebook_size + 6),
                        "青年": kwargs.get("youth", self.timebook_size + 7),
                        "teenager": kwargs.get("youth", self.timebook_size + 7),
                        "中年": kwargs.get("adult", self.timebook_size + 8),
                        "adult": kwargs.get("adult", self.timebook_size + 8),
                        "中老年": kwargs.get("middle", self.timebook_size + 9),
                        "middle-aged": kwargs.get("middle", self.timebook_size + 9),
                        "老年": kwargs.get("elderly", self.timebook_size + 10),
                        "elderly": kwargs.get("elderly", self.timebook_size + 10)}
        self.speaker_id_start = kwargs.get("speaker_id_start", self.timebook_size + 11)

        # Required kwarg: comma-separated roles that must appear in "messages".
        load_meta_data_key = kwargs.get("load_meta_data_key").split(",")

        if not (data_jsonl.endswith(".jsonl") or data_jsonl.endswith(".json")):
            # data_jsonl is a plain-text list of jsonl paths, one per line
            with open(data_jsonl, encoding="utf-8") as fin:
                file_list = fin.readlines()
            logging.info(f"file_list: {file_list}")
        else:
            file_list = [data_jsonl]

        contents = []
        for file_json in file_list:
            with open(file_json.strip(), encoding="utf-8") as fin:
                for line in fin:
                    data_dict = json.loads(line.strip())
                    utt = data_dict["utt"]
                    data_type = data_dict.get("type")
                    # Fix: unknown/missing types previously fell back to the
                    # hard-coded literal 1500, which is wrong whenever
                    # timebook_size is overridden; use the configured base id.
                    type_id = self.type_map.get(data_type, self.timebook_size)
                    data = data_dict["messages"]
                    speech_length = data_dict.get("speech_length", -1)
                    # 2 for startofclue, endofclue
                    text_length = data_dict.get("text_length", -1) + data_dict.get("clue_length", -1) + 2
                    if self.max_token_length is not None and (speech_length > self.max_token_length or speech_length <= 0):
                        logging.info(
                            f"speech_length: {speech_length} > {self.max_token_length}, drop it: {data_dict}"
                        )
                        continue
                    if self.max_text_length is not None and (text_length > self.max_text_length or text_length <= 0):
                        logging.info(
                            f"text_length: {text_length} > {self.max_text_length}, drop it: {data_dict}"
                        )
                        continue

                    # Drop samples missing any required role.
                    skip_flag = None
                    roles = {item.get("role") for item in data}
                    for key in load_meta_data_key:
                        if key not in roles:
                            skip_flag = key
                            break
                    if skip_flag is not None:
                        logging.info(
                            f"doesn't have {skip_flag}, drop it: {data_dict}")
                        continue

                    contents_i = {}
                    timespk_ids_len = 0
                    for i, item in enumerate(data):
                        role = item["role"]
                        content = item["content"]
                        for key in load_meta_data_key:
                            if role == key:
                                if key == "dialogue":
                                    # Dialogue segments are flattened to id tuples.
                                    timespk_ids = self.timespk_to_codec(content)
                                    timespk_ids_len = len(timespk_ids)
                                    if timespk_ids_len == 0:
                                        logging.info(f"[WARNING] len of timespk_ids is 0: {data_dict}")
                                    contents_i["timespk_ids"] = timespk_ids
                                else:
                                    contents_i[role] = content
                    contents_i["utt"] = utt
                    contents_i["type_id"] = type_id
                    # face embs len = speech tokens len, so need * 2;
                    # 4: sos, tos, eos; type_id
                    contents_i["source_len"] = speech_length * 2 + text_length + timespk_ids_len + 4
                    contents_i["speech_len"] = speech_length
                    contents_i["text_len"] = text_length  # include clue_length
                    contents.append(contents_i)

        self.contents = contents

        logging.info("total_num of samplers: {}, {}".format(len(self.contents), data_jsonl))

    def timespk_to_codec(self, dialogue):
        """Flatten dialogue segments into id tuples.

        Emits (start, spk, gender, age, end) * n_parts; start/end are frame
        indices (seconds * frame_shift, offset by 1) and unmapped gender/age
        strings become ``ignore_id``.
        """
        # tuple tokens (start, spk, gender, age, end) * n_parts
        n_parts = len(dialogue)
        if n_parts == 0:
            return np.array([], dtype=np.int64)
        starts = np.array([part["start"] for part in dialogue])
        durations = np.array([part["duration"] for part in dialogue])
        speakers = np.array([int(part["spk"]) for part in dialogue])
        genders = [part["gender"] for part in dialogue]
        ages = [part["age"] for part in dialogue]

        start_idxs = (starts * self.frame_shift + 1).astype(np.int64)
        end_idxs = ((starts + durations) * self.frame_shift + 1).astype(np.int64)
        # Speakers are 1-based in the manifest.
        spk_ids = (self.speaker_id_start + speakers - 1).astype(np.int64)
        gender_ids = [self.gender_map.get(g, self.ignore_id) for g in genders]
        age_ids = [self.age_map.get(a, self.ignore_id) for a in ages]

        sequence = np.full(n_parts * 5, self.ignore_id, dtype=np.int64)
        sequence[0::5] = start_idxs
        sequence[1::5] = spk_ids
        sequence[2::5] = gender_ids
        sequence[3::5] = age_ids
        sequence[4::5] = end_idxs
        return sequence

    def __len__(self):
        return len(self.contents)

    def __getitem__(self, index):
        # Returns the pre-parsed manifest dict for this sample.
        data = self.contents[index]

        return data

    def get_source_len(self, data_dict):
        # Total LM sequence length estimate for batch samplers.
        source_len = data_dict.get("source_len", 0)
        return source_len

    def get_target_len(self, data_dict):
        # Number of speech tokens.
        target_len = data_dict.get("speech_len", 0)
        return target_len
funcineforge/download/__init__.py ADDED
File without changes
funcineforge/download/download_model_from_hub.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from omegaconf import OmegaConf, DictConfig
4
+ from funcineforge.download.name_maps_from_hub import name_maps_ms, name_maps_hf, name_maps_openai
5
+
6
def download_model(**kwargs):
    """Resolve model kwargs for the configured hub.

    Dispatches on ``kwargs["hub"]`` ("ms" ModelScope default, "hf" Hugging
    Face, "openai" Whisper names) and returns kwargs with ``model_path``
    (and possibly ``model``) filled in.
    """
    hub = kwargs.get("hub", "ms")
    if hub == "ms":
        return download_from_ms(**kwargs)
    if hub == "hf":
        return download_from_hf(**kwargs)
    if hub == "openai":
        name_or_dir = kwargs.get("model")
        if os.path.exists(name_or_dir):
            # Local checkpoint directory: keep the path, load via the wrapper.
            kwargs["model_path"] = name_or_dir
            kwargs["model"] = "WhisperWarp"
        else:
            # Known alias -> openai model name; otherwise pass through as-is.
            kwargs["model_path"] = name_maps_openai.get(name_or_dir, name_or_dir)
    return kwargs
25
+
26
+
27
def download_from_ms(**kwargs):
    """Resolve a ModelScope model: download if needed, then merge its config.

    Maps short aliases through ``name_maps_ms``, downloads the snapshot when
    the path does not exist locally, then merges either the repo's
    ``configuration.json``-referenced config or its ``config.yaml`` into
    kwargs (kwargs win on conflict via OmegaConf.merge order).
    """
    model_or_path = kwargs.get("model")
    if model_or_path in name_maps_ms:
        model_or_path = name_maps_ms[model_or_path]
    model_revision = kwargs.get("model_revision", "master")
    if not os.path.exists(model_or_path) and "model_path" not in kwargs:
        try:
            model_or_path = get_or_download_model_dir(
                model_or_path,
                model_revision,
                is_training=kwargs.get("is_training"),
                check_latest=kwargs.get("check_latest", True),
            )
        except Exception as e:
            # Best-effort: fall through with the unresolved name on failure.
            print(f"Download: {model_or_path} failed!: {e}")

    kwargs["model_path"] = model_or_path if "model_path" not in kwargs else kwargs["model_path"]

    if os.path.exists(os.path.join(model_or_path, "configuration.json")):
        with open(os.path.join(model_or_path, "configuration.json"), "r", encoding="utf-8") as f:
            conf_json = json.load(f)

        # Resolve relative file references declared in configuration.json.
        cfg = {}
        if "file_path_metas" in conf_json:
            add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg)
        # cfg.update(kwargs)
        cfg = OmegaConf.merge(cfg, kwargs)
        if "config" in cfg:
            config = OmegaConf.load(cfg["config"])
            kwargs = OmegaConf.merge(config, cfg)
            kwargs["model"] = config["model"]
    elif os.path.exists(os.path.join(model_or_path, "config.yaml")):
        config = OmegaConf.load(os.path.join(model_or_path, "config.yaml"))
        kwargs = OmegaConf.merge(config, kwargs)

        # Any init_param entry that does not exist on disk is replaced by the
        # repo's model.pt. NOTE(review): init_param_new starts from the raw
        # init_param value and appends — confirm the leading element/comma is
        # handled downstream.
        init_param = kwargs.get("init_param", "")
        if (
            isinstance(init_param, str)
            and not os.path.exists(init_param)
            or isinstance(init_param, (list, tuple))
        ):
            init_param_new = init_param
            if isinstance(init_param, str):
                init_param = init_param.split(",")
            for init_param_i in init_param:
                if not os.path.exists(init_param_i):
                    print(f"init_param: {init_param_i}, does not exist")
                    init_param_i = os.path.join(model_or_path, "model.pt")
                init_param_new = f"{init_param_new},{init_param_i}"
            kwargs["init_param"] = init_param_new
        # assert os.path.exists(kwargs["init_param"]), "init_param does not exist"
        # Pick up tokenizer / frontend assets shipped with the repo.
        if os.path.exists(os.path.join(model_or_path, "tokens.txt")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(model_or_path, "tokens.txt")
        if os.path.exists(os.path.join(model_or_path, "tokens.json")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(model_or_path, "tokens.json")
        if os.path.exists(os.path.join(model_or_path, "seg_dict")):
            kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(model_or_path, "seg_dict")
        if os.path.exists(os.path.join(model_or_path, "bpe.model")):
            kwargs["tokenizer_conf"]["bpemodel"] = os.path.join(model_or_path, "bpe.model")
        kwargs["model"] = config["model"]
        if os.path.exists(os.path.join(model_or_path, "am.mvn")):
            kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
        if os.path.exists(os.path.join(model_or_path, "jieba_usr_dict")):
            kwargs["jieba_usr_dict"] = os.path.join(model_or_path, "jieba_usr_dict")
    if isinstance(kwargs, DictConfig):
        # Return a plain dict regardless of which merge path ran.
        kwargs = OmegaConf.to_container(kwargs, resolve=True)

    return kwargs
95
+
96
+
97
def download_from_hf(**kwargs):
    """Resolve a Hugging Face model: download if needed, then merge its config.

    Mirrors :func:`download_from_ms` but uses ``name_maps_hf`` / the HF hub,
    and only takes the config.yaml branch when a sibling ``model.pt`` exists.
    """
    model_or_path = kwargs.get("model")
    if model_or_path in name_maps_hf:
        model_or_path = name_maps_hf[model_or_path]
    model_revision = kwargs.get("model_revision", "master")
    if not os.path.exists(model_or_path) and "model_path" not in kwargs:
        try:
            model_or_path = get_or_download_model_dir_hf(
                model_or_path,
                model_revision,
                is_training=kwargs.get("is_training"),
                check_latest=kwargs.get("check_latest", True),
            )
        except Exception as e:
            # Best-effort: fall through with the unresolved name on failure.
            print(f"Download: {model_or_path} failed!: {e}")

    kwargs["model_path"] = model_or_path if "model_path" not in kwargs else kwargs["model_path"]

    if os.path.exists(os.path.join(model_or_path, "configuration.json")):
        with open(os.path.join(model_or_path, "configuration.json"), "r", encoding="utf-8") as f:
            conf_json = json.load(f)

        # Resolve relative file references declared in configuration.json.
        cfg = {}
        if "file_path_metas" in conf_json:
            add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg)
        cfg = OmegaConf.merge(cfg, kwargs)
        # cfg.update(kwargs)
        if "config" in cfg:
            config = OmegaConf.load(cfg["config"])
            kwargs = OmegaConf.merge(config, cfg)
            kwargs["model"] = config["model"]
    elif os.path.exists(os.path.join(model_or_path, "config.yaml")) and os.path.exists(
        os.path.join(model_or_path, "model.pt")
    ):
        config = OmegaConf.load(os.path.join(model_or_path, "config.yaml"))
        kwargs = OmegaConf.merge(config, kwargs)
        init_param = os.path.join(model_or_path, "model.pt")
        kwargs["init_param"] = init_param
        # Pick up tokenizer / frontend assets shipped with the repo.
        if os.path.exists(os.path.join(model_or_path, "tokens.txt")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(model_or_path, "tokens.txt")
        if os.path.exists(os.path.join(model_or_path, "tokens.json")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(model_or_path, "tokens.json")
        if os.path.exists(os.path.join(model_or_path, "seg_dict")):
            kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(model_or_path, "seg_dict")
        if os.path.exists(os.path.join(model_or_path, "bpe.model")):
            kwargs["tokenizer_conf"]["bpemodel"] = os.path.join(model_or_path, "bpe.model")
        kwargs["model"] = config["model"]
        if os.path.exists(os.path.join(model_or_path, "am.mvn")):
            kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
        if os.path.exists(os.path.join(model_or_path, "jieba_usr_dict")):
            kwargs["jieba_usr_dict"] = os.path.join(model_or_path, "jieba_usr_dict")
    if isinstance(kwargs, DictConfig):
        # Return a plain dict regardless of which merge path ran.
        kwargs = OmegaConf.to_container(kwargs, resolve=True)

    return kwargs
152
+
153
+
154
def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg=None):
    """Recursively resolve relative file paths against a model directory.

    For every string value in ``file_path_metas`` that names an existing file
    under ``model_or_path``, store the absolute path in ``cfg`` under the same
    key; nested dicts are resolved into nested dicts. Missing files are
    silently skipped.

    Args:
        model_or_path: Root directory the relative paths are joined to.
        file_path_metas: Mapping of key -> relative path (or nested mapping).
        cfg: Output mapping, mutated in place and returned. Defaults to a new
            dict (fix: the original used a mutable default argument ``cfg={}``,
            which is shared across calls).

    Returns:
        dict: ``cfg`` with the resolved paths filled in.
    """
    if cfg is None:
        cfg = {}
    if isinstance(file_path_metas, dict):
        for k, v in file_path_metas.items():
            if isinstance(v, str):
                p = os.path.join(model_or_path, v)
                if os.path.exists(p):
                    cfg[k] = p
            elif isinstance(v, dict):
                if k not in cfg:
                    cfg[k] = {}
                add_file_root_path(model_or_path, v, cfg[k])
    return cfg
167
+
168
+
169
def get_or_download_model_dir(
    model,
    model_revision=None,
    is_training=False,
    check_latest=True,
):
    """Get local model directory or download model if necessary.

    Args:
        model (str): Model id or path to a local model directory.
        model_revision (str, optional): Model version number.
        is_training (bool): Whether invoked from a trainer; only affects the
            user-agent reported to ModelScope.
        check_latest (bool): When ``model`` is a local path, verify it is the
            latest hub revision (best-effort).

    Returns:
        str: Path to the local model cache directory.
    """
    # Imported lazily so modelscope remains an optional dependency.
    from modelscope.hub.check_model import check_local_model_is_latest
    from modelscope.hub.snapshot_download import snapshot_download

    from modelscope.utils.constant import Invoke, ThirdParty

    key = Invoke.LOCAL_TRAINER if is_training else Invoke.PIPELINE

    if os.path.exists(model) and check_latest:
        model_cache_dir = model if os.path.isdir(model) else os.path.dirname(model)
        try:
            check_local_model_is_latest(
                model_cache_dir, user_agent={Invoke.KEY: key, ThirdParty.KEY: "funcineforge"}
            )
        except Exception:
            # Fix: narrowed from a bare ``except:`` that also swallowed
            # KeyboardInterrupt/SystemExit. The check stays best-effort.
            print("could not check the latest version")
    else:
        model_cache_dir = snapshot_download(
            model, revision=model_revision, user_agent={Invoke.KEY: key, ThirdParty.KEY: "funcineforge"}
        )
    return model_cache_dir
202
+
203
+
204
def get_or_download_model_dir_hf(
    model,
    model_revision=None,
    is_training=False,
    check_latest=True,
):
    """Fetch a Hugging Face model snapshot and return its local directory.

    Args:
        model (str): Model id or path to a local model directory.
        model_revision (str, optional): Model version number; currently
            unused — kept for signature parity with the ModelScope variant,
            as are ``is_training`` and ``check_latest``.

    Returns:
        str: Local snapshot directory resolved by ``huggingface_hub``.
    """
    # Imported lazily so huggingface_hub remains an optional dependency.
    from huggingface_hub import snapshot_download

    return snapshot_download(model)
funcineforge/download/file.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import contextlib
4
+ import os
5
+ import tempfile
6
+ from abc import ABCMeta, abstractmethod
7
+ from pathlib import Path
8
+ from typing import Generator, Union
9
+
10
+ import requests
11
+ from urllib.parse import urlparse
12
+
13
+
14
def download_from_url(url):
    """Download ``url`` into a fresh temporary directory and return the file path.

    The file keeps the URL's basename. Raises AssertionError if ``url`` has no
    scheme (nothing was downloaded). The caller owns (and should clean up) the
    temporary directory.
    """
    result = urlparse(url)
    file_path = None
    if result.scheme is not None and len(result.scheme) > 0:
        storage = HTTPStorage()
        # bytes
        data = storage.read(url)
        # Fix: the original used tempfile.TemporaryDirectory().name — that
        # directory is deleted as soon as the TemporaryDirectory object is
        # garbage-collected, which can remove the downloaded file out from
        # under the caller. mkdtemp() transfers ownership to us instead.
        work_dir = tempfile.mkdtemp()
        file_path = os.path.join(work_dir, os.path.basename(url))
        with open(file_path, "wb") as fb:
            fb.write(data)
    assert file_path is not None, f"failed to download: {url}"
    return file_path
+
30
+
31
class Storage(metaclass=ABCMeta):
    """Abstract interface for storage backends.

    Concrete backends implement the four primitives below: binary and text
    reads (``read`` / ``read_text``) and binary and text writes
    (``write`` / ``write_text``).
    """

    @abstractmethod
    def read(self, filepath: str):
        """Return the raw bytes stored at ``filepath``."""

    @abstractmethod
    def read_text(self, filepath: str):
        """Return the text stored at ``filepath``."""

    @abstractmethod
    def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
        """Persist the bytes ``obj`` at ``filepath``."""

    @abstractmethod
    def write_text(self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8") -> None:
        """Persist the text ``obj`` at ``filepath`` using ``encoding``."""
54
+
55
+
56
class LocalStorage(Storage):
    """Storage backend for the local filesystem."""

    def read(self, filepath: Union[str, Path]) -> bytes:
        """Return the raw bytes of the file at ``filepath`` (opened 'rb')."""
        with open(filepath, "rb") as fh:
            return fh.read()

    def read_text(self, filepath: Union[str, Path], encoding: str = "utf-8") -> str:
        """Return the contents of ``filepath`` decoded with ``encoding``."""
        with open(filepath, "r", encoding=encoding) as fh:
            return fh.read()

    def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
        """Write ``obj`` to ``filepath`` in binary mode.

        Missing parent directories are created automatically.
        """
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(filepath, "wb") as fh:
            fh.write(obj)

    def write_text(self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8") -> None:
        """Write the text ``obj`` to ``filepath`` with ``encoding``.

        Missing parent directories are created automatically.
        """
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(filepath, "w", encoding=encoding) as fh:
            fh.write(obj)

    @contextlib.contextmanager
    def as_local_path(self, filepath: Union[str, Path]) -> Generator[Union[str, Path], None, None]:
        """Local files are already local — yield ``filepath`` unchanged."""
        yield filepath
129
+
130
+
131
class HTTPStorage(Storage):
    """Read-only storage backend for http/https URLs."""

    def read(self, url):
        # TODO: add a progress bar for large downloads.
        response = requests.get(url)
        response.raise_for_status()
        return response.content

    def read_text(self, url):
        response = requests.get(url)
        response.raise_for_status()
        return response.text

    @contextlib.contextmanager
    def as_local_path(self, filepath: str) -> Generator[Union[str, Path], None, None]:
        """Download ``filepath`` to a temporary file and yield its path.

        The temporary file is removed when the ``with`` block exits.

        Examples:
            >>> storage = HTTPStorage()
            >>> with storage.as_local_path('http://path/to/file') as path:
            ...     pass  # use the local copy here
        """
        try:
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.write(self.read(filepath))
            tmp.close()
            yield tmp.name
        finally:
            os.remove(tmp.name)

    def write(self, obj: bytes, url: Union[str, Path]) -> None:
        raise NotImplementedError("write is not supported by HTTP Storage")

    def write_text(self, obj: str, url: Union[str, Path], encoding: str = "utf-8") -> None:
        raise NotImplementedError("write_text is not supported by HTTP Storage")
176
+
177
+
178
class OSSStorage(Storage):
    """OSS (object storage) backend — placeholder, not yet implemented.

    Every operation raises NotImplementedError; the class exists so the
    ``oss://`` scheme is recognized by :class:`File`.
    """

    def __init__(self, oss_config_file=None):
        # read from config file or env var
        raise NotImplementedError("OSSStorage.__init__ to be implemented in the future")

    def read(self, filepath):
        raise NotImplementedError("OSSStorage.read to be implemented in the future")

    def read_text(self, filepath, encoding="utf-8"):
        raise NotImplementedError("OSSStorage.read_text to be implemented in the future")

    @contextlib.contextmanager
    def as_local_path(self, filepath: str) -> Generator[Union[str, Path], None, None]:
        """Download a file from ``filepath`` into a temporary file and yield
        its path; the temporary file is removed when the ``with`` block exits.

        NOTE(review): effectively unreachable today — ``self.read`` always
        raises NotImplementedError; kept for interface parity with
        HTTPStorage.
        """
        try:
            f = tempfile.NamedTemporaryFile(delete=False)
            f.write(self.read(filepath))
            f.close()
            yield f.name
        finally:
            os.remove(f.name)

    def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
        raise NotImplementedError("OSSStorage.write to be implemented in the future")

    def write_text(self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8") -> None:
        raise NotImplementedError("OSSStorage.write_text to be implemented in the future")
222
+
223
+
224
# Lazily-populated cache of storage backend singletons, keyed by URI scheme
# (filled in by File._get_storage).
G_STORAGES = {}
225
+
226
+
227
class File(object):
    """Static facade that routes read/write calls to a storage backend
    chosen from the URI scheme (plain local path, ``http(s)://``, ``oss://``).

    Backend instances are created lazily and cached in the module-level
    ``G_STORAGES`` dict, one per scheme.
    """

    _prefix_to_storage: dict = {
        "oss": OSSStorage,
        "http": HTTPStorage,
        "https": HTTPStorage,
        "local": LocalStorage,
    }

    @staticmethod
    def _get_storage(uri):
        """Return (and lazily cache) the storage backend for ``uri``."""
        assert isinstance(uri, str), f"uri should be str type, but got {type(uri)}"

        if "://" not in uri:
            # local path
            storage_type = "local"
        else:
            prefix, _ = uri.split("://")
            storage_type = prefix

        assert storage_type in File._prefix_to_storage, (
            f"Unsupported uri {uri}, valid prefixs: " f"{list(File._prefix_to_storage.keys())}"
        )

        if storage_type not in G_STORAGES:
            G_STORAGES[storage_type] = File._prefix_to_storage[storage_type]()

        return G_STORAGES[storage_type]

    @staticmethod
    def read(uri: str) -> bytes:
        """Read the resource at ``uri`` as raw bytes.

        Args:
            uri (str): Local path or ``scheme://`` URI to read.

        Returns:
            bytes: Expected bytes object.
        """
        storage = File._get_storage(uri)
        return storage.read(uri)

    @staticmethod
    def read_text(uri: Union[str, Path], encoding: str = "utf-8") -> str:
        """Read the resource at ``uri`` as text.

        NOTE(review): ``encoding`` is accepted but not forwarded to the
        backend (HTTPStorage.read_text takes no encoding parameter), so
        backends use their own default — confirm whether forwarding to
        local storage is intended.

        Returns:
            str: Expected text reading from ``uri``.
        """
        storage = File._get_storage(uri)
        return storage.read_text(uri)

    @staticmethod
    def write(obj: bytes, uri: Union[str, Path]) -> None:
        """Write raw bytes ``obj`` to ``uri`` (parents created by the backend
        where applicable)."""
        storage = File._get_storage(uri)
        return storage.write(obj, uri)

    @staticmethod
    def write_text(obj: str, uri: str, encoding: str = "utf-8") -> None:
        """Write the text ``obj`` to ``uri`` (see ``read_text`` note on the
        ``encoding`` parameter)."""
        storage = File._get_storage(uri)
        return storage.write_text(obj, uri)

    @staticmethod
    @contextlib.contextmanager
    def as_local_path(uri: str) -> Generator[Union[str, Path], None, None]:
        """Yield a local filesystem path for ``uri``, downloading if remote.

        Fix: added the missing ``@staticmethod`` decorator (all sibling
        methods have it). As a bare function-in-class, calling this on an
        instance would have bound ``uri`` to the instance; class-level calls
        are unaffected.
        """
        storage = File._get_storage(uri)
        with storage.as_local_path(uri) as local_path:
            yield local_path
funcineforge/download/name_maps_from_hub.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name_maps_ms = {
2
+ "paraformer": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
3
+ "paraformer-zh": "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
4
+ "paraformer-en": "iic/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020",
5
+ "paraformer-en-spk": "iic/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020",
6
+ "paraformer-zh-streaming": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
7
+ "fsmn-vad": "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
8
+ "ct-punc": "iic/punc_ct-transformer_cn-en-common-vocab471067-large",
9
+ "ct-punc-c": "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
10
+ "fa-zh": "iic/speech_timestamp_prediction-v1-16k-offline",
11
+ "cam++": "iic/speech_campplus_sv_zh-cn_16k-common",
12
+ "Whisper-large-v3": "iic/Whisper-large-v3",
13
+ "Qwen-Audio": "Qwen/Qwen-Audio",
14
+ "emotion2vec_plus_large": "iic/emotion2vec_plus_large",
15
+ "emotion2vec_plus_base": "iic/emotion2vec_plus_base",
16
+ "emotion2vec_plus_seed": "iic/emotion2vec_plus_seed",
17
+ }
18
+
19
+ name_maps_hf = {
20
+ "paraformer": "funasr/paraformer-zh",
21
+ "paraformer-zh": "funasr/paraformer-zh",
22
+ "paraformer-en": "funasr/paraformer-zh",
23
+ "paraformer-zh-streaming": "funasr/paraformer-zh-streaming",
24
+ "fsmn-vad": "funasr/fsmn-vad",
25
+ "ct-punc": "funasr/ct-punc",
26
+ "ct-punc-c": "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
27
+ "fa-zh": "funasr/fa-zh",
28
+ "cam++": "funasr/campplus",
29
+ "iic/emotion2vec_plus_large": "emotion2vec/emotion2vec_plus_large",
30
+ "iic/emotion2vec_plus_base": "emotion2vec/emotion2vec_plus_base",
31
+ "iic/emotion2vec_plus_seed": "emotion2vec/emotion2vec_plus_seed",
32
+ }
33
+
34
+ name_maps_openai = {
35
+ "Whisper-base.en": "base.en",
36
+ "Whisper-base": "base",
37
+ "Whisper-large": "large",
38
+ "Whisper-large-v1": "large-v1",
39
+ "Whisper-large-v2": "large-v2",
40
+ "Whisper-large-v3": "large-v3",
41
+ "Whisper-large-v3-turbo": "turbo",
42
+ }
funcineforge/face/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .face_recognition import FaceRecIR101
funcineforge/face/face_recognition.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def FaceRecIR101(init_param_path, **kwargs):
2
+ """
3
+ Face embeddings extraction with CurricularFace pretrained model.
4
+ Reference:
5
+ - https://modelscope.cn/models/iic/cv_ir101_facerecognition_cfglint
6
+ """
7
+ import onnxruntime
8
+ options = onnxruntime.SessionOptions()
9
+ options.intra_op_num_threads = 8
10
+ options.inter_op_num_threads = 8
11
+ ort_session = onnxruntime.InferenceSession(
12
+ init_param_path,
13
+ sess_options=options,
14
+ providers=['CPUExecutionProvider']
15
+ )
16
+ return ort_session
funcineforge/models/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .specaug.specaug import SpecAug as FunCineForgeSpecAug
2
+ from .language_model import FunCineForgeLM
3
+ from .causal_hifigan import CausalHifiGan
4
+ from .flow_matching_model import CosyVoiceFlowMatching
5
+ from .inference_model import FunCineForgeInferModel
funcineforge/models/causal_hifigan.py ADDED
@@ -0,0 +1,834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 KaiHu
2
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ """HIFI-GAN"""
5
+
6
+ from typing import Dict
7
+ from typing import Tuple, List
8
+
9
+ import numpy as np
10
+ from scipy.signal import get_window
11
+ import torch
12
+ import torchaudio
13
+ from torch import nn
14
+ import torch.nn.functional as F
15
+ from torch.nn.utils import remove_weight_norm
16
+ from torch.nn.utils.parametrize import remove_parametrizations
17
+ from torch.nn.utils.parametrizations import weight_norm
18
+ import logging
19
+ from funcineforge.utils.device_funcs import to_device
20
+ import os
21
+ from torch.nn.utils.rnn import pad_sequence
22
+ from funcineforge.models.utils import dtype_map
23
+ from funcineforge.models.modules.hifigan import init_weights
24
+ from funcineforge.models.modules.hifigan.activations import Snake
25
+
26
+
27
+ class LookRightConv1d(torch.nn.Conv1d):
28
+ def __init__(
29
+ self,
30
+ in_channels: int,
31
+ out_channels: int,
32
+ kernel_size: int,
33
+ stride: int = 1,
34
+ dilation: int = 1,
35
+ groups: int = 1,
36
+ bias: bool = True,
37
+ padding_mode: str = 'zeros',
38
+ device=None,
39
+ dtype=None
40
+ ) -> None:
41
+ super(LookRightConv1d, self).__init__(in_channels, out_channels,
42
+ kernel_size, stride,
43
+ padding=0, dilation=dilation,
44
+ groups=groups, bias=bias,
45
+ padding_mode=padding_mode,
46
+ device=device, dtype=dtype)
47
+ assert stride == 1
48
+ self.causal_padding = kernel_size - 1
49
+
50
+ def forward(self, x: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
51
+ if context.size(2) == 0:
52
+ x = F.pad(x, (0, self.causal_padding), value=0.0)
53
+ else:
54
+ assert context.size(2) == self.causal_padding
55
+ x = torch.concat([x, context], dim=2)
56
+ x = super(LookRightConv1d, self).forward(x)
57
+ return x
58
+
59
+ class LookLeftConv1d(torch.nn.Conv1d):
60
+ def __init__(
61
+ self,
62
+ in_channels: int,
63
+ out_channels: int,
64
+ kernel_size: int,
65
+ stride: int = 1,
66
+ dilation: int = 1,
67
+ groups: int = 1,
68
+ bias: bool = True,
69
+ padding_mode: str = 'zeros',
70
+ device=None,
71
+ dtype=None
72
+ ) -> None:
73
+ super(LookLeftConv1d, self).__init__(in_channels, out_channels,
74
+ kernel_size, stride,
75
+ padding=0, dilation=dilation,
76
+ groups=groups, bias=bias,
77
+ padding_mode=padding_mode,
78
+ device=device, dtype=dtype)
79
+ assert stride == 1 and dilation == 1
80
+ self.causal_padding = kernel_size - 1
81
+
82
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
83
+ if cache.size(2) == 0:
84
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
85
+ else:
86
+ assert cache.size(2) == self.causal_padding
87
+ x = torch.concat([cache, x], dim=2)
88
+ # NOTE 兼容kernel_size=1的情况
89
+ if self.causal_padding == 0:
90
+ cache_new = x[:, :, :0]
91
+ else:
92
+ cache_new = x[:, :, -self.causal_padding:]
93
+ x = super(LookLeftConv1d, self).forward(x)
94
+ return x, cache_new
95
+
96
+
97
+ class CausalConvRNNF0Predictor(nn.Module):
98
+ def __init__(self,
99
+ num_class: int = 1,
100
+ in_channels: int = 80,
101
+ cond_channels: int = 512
102
+ ):
103
+ super().__init__()
104
+
105
+ self.num_class = num_class
106
+ self.condnet = nn.Sequential(
107
+ weight_norm(
108
+ LookRightConv1d(in_channels, cond_channels, kernel_size=4)
109
+ ),
110
+ nn.ELU(),
111
+ weight_norm(
112
+ LookLeftConv1d(cond_channels, cond_channels, kernel_size=3)
113
+ ),
114
+ nn.ELU(),
115
+ weight_norm(
116
+ LookLeftConv1d(cond_channels, cond_channels, kernel_size=3)
117
+ ),
118
+ nn.ELU(),
119
+ weight_norm(
120
+ LookLeftConv1d(cond_channels, cond_channels, kernel_size=3)
121
+ ),
122
+ nn.ELU(),
123
+ weight_norm(
124
+ LookLeftConv1d(cond_channels, cond_channels, kernel_size=3)
125
+ ),
126
+ nn.ELU(),
127
+ )
128
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
129
+
130
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0, 0), finalize: bool = True) -> torch.Tensor:
131
+ if finalize is False:
132
+ x, context = x[:, :, :-self.condnet[0].causal_padding], x[:, :, -self.condnet[0].causal_padding:]
133
+ else:
134
+ x, context = x, x[:, :, :0]
135
+ x = self.condnet[0](x, context)
136
+ x = self.condnet[1](x)
137
+ if cache.size(0) != 0:
138
+ x, cache[0] = self.condnet[2](x, cache[0])
139
+ else:
140
+ x, _ = self.condnet[2](x)
141
+ x = self.condnet[3](x)
142
+ if cache.size(0) != 0:
143
+ x, cache[1] = self.condnet[4](x, cache[1])
144
+ else:
145
+ x, _ = self.condnet[4](x)
146
+ x = self.condnet[5](x)
147
+ if cache.size(0) != 0:
148
+ x, cache[2] = self.condnet[6](x, cache[2])
149
+ else:
150
+ x, _ = self.condnet[6](x)
151
+ x = self.condnet[7](x)
152
+ if cache.size(0) != 0:
153
+ x, cache[3] = self.condnet[8](x, cache[3])
154
+ else:
155
+ x, _ = self.condnet[8](x)
156
+ x = self.condnet[9](x)
157
+ x = x.transpose(1, 2)
158
+ x = torch.abs(self.classifier(x).squeeze(-1))
159
+ return x, cache
160
+
161
+ def init_cache(self, device):
162
+ return torch.zeros(4, 1, 512, 2).to(device)
163
+
164
+ def remove_weight_norm(self):
165
+ print('Removing weight norm...')
166
+ try:
167
+ remove_weight_norm(self.condnet[0])
168
+ remove_weight_norm(self.condnet[2])
169
+ remove_weight_norm(self.condnet[4])
170
+ remove_weight_norm(self.condnet[6])
171
+ remove_weight_norm(self.condnet[8])
172
+ except:
173
+ remove_parametrizations(self.condnet[0], 'weight')
174
+ remove_parametrizations(self.condnet[2], 'weight')
175
+ remove_parametrizations(self.condnet[4], 'weight')
176
+ remove_parametrizations(self.condnet[6], 'weight')
177
+ remove_parametrizations(self.condnet[8], 'weight')
178
+
179
+
180
+ class LookLeftConvTranspose1d(torch.nn.Conv1d):
181
+ def __init__(
182
+ self,
183
+ in_channels: int,
184
+ out_channels: int,
185
+ kernel_size: int,
186
+ stride: int = 1,
187
+ dilation: int = 1,
188
+ groups: int = 1,
189
+ bias: bool = True,
190
+ padding_mode: str = 'zeros',
191
+ device=None,
192
+ dtype=None
193
+ ) -> None:
194
+ super(LookLeftConvTranspose1d, self).__init__(in_channels, out_channels,
195
+ kernel_size, 1,
196
+ padding=0, dilation=dilation,
197
+ groups=groups, bias=bias,
198
+ padding_mode=padding_mode,
199
+ device=device, dtype=dtype)
200
+ assert dilation == 1 and stride != 1
201
+ self.causal_padding = kernel_size - 1
202
+ self.upsample = torch.nn.Upsample(scale_factor=stride, mode='nearest')
203
+
204
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
205
+ x = self.upsample(x)
206
+ if cache.size(2) == 0:
207
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
208
+ else:
209
+ assert cache.size(2) == self.causal_padding
210
+ x = torch.concat([cache, x], dim=2)
211
+ cache_new = x[:, :, -self.causal_padding:]
212
+ x = super(LookLeftConvTranspose1d, self).forward(x)
213
+ return x, cache_new
214
+
215
+
216
+ class LookLeftConv1dWithStride(torch.nn.Conv1d):
217
+ def __init__(
218
+ self,
219
+ in_channels: int,
220
+ out_channels: int,
221
+ kernel_size: int,
222
+ stride: int = 1,
223
+ dilation: int = 1,
224
+ groups: int = 1,
225
+ bias: bool = True,
226
+ padding_mode: str = 'zeros',
227
+ device=None,
228
+ dtype=None
229
+ ) -> None:
230
+ super(LookLeftConv1dWithStride, self).__init__(in_channels, out_channels,
231
+ kernel_size, stride,
232
+ padding=0, dilation=dilation,
233
+ groups=groups, bias=bias,
234
+ padding_mode=padding_mode,
235
+ device=device, dtype=dtype)
236
+ assert stride != 1 and dilation == 1
237
+ assert kernel_size % stride == 0
238
+ self.causal_padding = stride - 1
239
+
240
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
241
+ if cache.size(2) == 0:
242
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
243
+ else:
244
+ assert cache.size(2) == self.causal_padding
245
+ x = torch.concat([cache, x], dim=2)
246
+ cache_new = x[:, :, -self.causal_padding:]
247
+ x = super(LookLeftConv1dWithStride, self).forward(x)
248
+ return x, cache_new
249
+
250
+
251
+ class LookLeftConv1dWithDilation(torch.nn.Conv1d):
252
+ def __init__(
253
+ self,
254
+ in_channels: int,
255
+ out_channels: int,
256
+ kernel_size: int,
257
+ stride: int = 1,
258
+ dilation: int = 1,
259
+ groups: int = 1,
260
+ bias: bool = True,
261
+ padding_mode: str = 'zeros',
262
+ device=None,
263
+ dtype=None
264
+ ) -> None:
265
+ super(LookLeftConv1dWithDilation, self).__init__(in_channels, out_channels,
266
+ kernel_size, stride,
267
+ padding=0, dilation=dilation,
268
+ groups=groups, bias=bias,
269
+ padding_mode=padding_mode,
270
+ device=device, dtype=dtype)
271
+ # NOTE(lyuxiang.lx) 这个causal_padding仅在kernel_size为奇数时才成立
272
+ assert kernel_size // 2 * dilation * 2 == int((kernel_size * dilation - dilation) / 2) * 2
273
+ self.causal_padding = int((kernel_size * dilation - dilation) / 2) * 2
274
+
275
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> Tuple[torch.Tensor, torch.Tensor]:
276
+ if cache.size(2) == 0:
277
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
278
+ else:
279
+ assert cache.size(2) == self.causal_padding
280
+ x = torch.concat([cache, x], dim=2)
281
+ cache_new = x[:, :, -self.causal_padding:]
282
+ x = super(LookLeftConv1dWithDilation, self).forward(x)
283
+ return x, cache_new
284
+
285
+
286
+ class ResBlock(torch.nn.Module):
287
+ """Residual block module in HiFiGAN/BigVGAN."""
288
+ def __init__(
289
+ self,
290
+ channels: int = 512,
291
+ kernel_size: int = 3,
292
+ dilations: List[int] = [1, 3, 5],
293
+ ):
294
+ super(ResBlock, self).__init__()
295
+ self.convs1 = nn.ModuleList()
296
+ self.convs2 = nn.ModuleList()
297
+
298
+ for dilation in dilations:
299
+ self.convs1.append(
300
+ weight_norm(
301
+ LookLeftConv1dWithDilation(
302
+ channels,
303
+ channels,
304
+ kernel_size,
305
+ 1,
306
+ dilation=dilation
307
+ ) if dilation != 1 else
308
+ LookLeftConv1d(
309
+ channels,
310
+ channels,
311
+ kernel_size,
312
+ 1,
313
+ dilation=dilation
314
+ )
315
+ )
316
+ )
317
+ self.convs2.append(
318
+ weight_norm(
319
+ LookLeftConv1d(
320
+ channels,
321
+ channels,
322
+ kernel_size,
323
+ 1,
324
+ dilation=1
325
+ )
326
+ )
327
+ )
328
+ self.convs1.apply(init_weights)
329
+ self.convs2.apply(init_weights)
330
+ self.activations1 = nn.ModuleList([
331
+ Snake(channels, alpha_logscale=False)
332
+ for _ in range(len(self.convs1))
333
+ ])
334
+ self.activations2 = nn.ModuleList([
335
+ Snake(channels, alpha_logscale=False)
336
+ for _ in range(len(self.convs2))
337
+ ])
338
+
339
+ def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0, 0, 0)) -> torch.Tensor:
340
+ for idx in range(len(self.convs1)):
341
+ xt = self.activations1[idx](x)
342
+ xt, _ = self.convs1[idx](xt)
343
+ xt = self.activations2[idx](xt)
344
+ xt, _ = self.convs2[idx](xt)
345
+ x = xt + x
346
+ return x, cache
347
+
348
+ def remove_weight_norm(self):
349
+ for idx in range(len(self.convs1)):
350
+ try:
351
+ remove_weight_norm(self.convs1[idx])
352
+ remove_weight_norm(self.convs2[idx])
353
+ except:
354
+ remove_parametrizations(self.convs1[idx], 'weight')
355
+ remove_parametrizations(self.convs2[idx], 'weight')
356
+
357
+
358
+ class SineGen(torch.nn.Module):
359
+ """ Definition of sine generator
360
+ SineGen(samp_rate, harmonic_num = 0,
361
+ sine_amp = 0.1, noise_std = 0.003,
362
+ voiced_threshold = 0,
363
+ flag_for_pulse=False)
364
+ samp_rate: sampling rate in Hz
365
+ harmonic_num: number of harmonic overtones (default 0)
366
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
367
+ noise_std: std of Gaussian noise (default 0.003)
368
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
369
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
370
+ Note: when flag_for_pulse is True, the first time step of a voiced
371
+ segment is always sin(np.pi) or cos(0)
372
+ """
373
+
374
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
375
+ sine_amp=0.1, noise_std=0.003,
376
+ voiced_threshold=0,
377
+ flag_for_pulse=False):
378
+ super(SineGen, self).__init__()
379
+ self.sine_amp = sine_amp
380
+ self.noise_std = noise_std
381
+ self.harmonic_num = harmonic_num
382
+ self.dim = self.harmonic_num + 1
383
+ self.sampling_rate = samp_rate
384
+ self.voiced_threshold = voiced_threshold
385
+ self.flag_for_pulse = flag_for_pulse
386
+ self.upsample_scale = upsample_scale
387
+ self.rand_ini = torch.rand(1, 9)
388
+ self.rand_ini[:, 0] = 0
389
+ self.sine_waves = torch.rand(1, 300 * 24000, 9)
390
+
391
+ def _f02uv(self, f0):
392
+ # generate uv signal
393
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
394
+ return uv
395
+
396
+ def _f02sine(self, f0_values):
397
+ """ f0_values: (batchsize, length, dim)
398
+ where dim indicates fundamental tone and overtones
399
+ """
400
+ # convert to F0 in rad. The interger part n can be ignored
401
+ # because 2 * np.pi * n doesn't affect phase
402
+ rad_values = (f0_values / self.sampling_rate) % 1
403
+
404
+ # initial phase noise (no noise for fundamental component)
405
+ rad_values[:, 0, :] = rad_values[:, 0, :] + self.rand_ini.to(rad_values.device)
406
+
407
+ # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
408
+ if not self.flag_for_pulse:
409
+ # # for normal case
410
+
411
+ # # To prevent torch.cumsum numerical overflow,
412
+ # # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
413
+ # # Buffer tmp_over_one_idx indicates the time step to add -1.
414
+ # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
415
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
416
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
417
+ # cumsum_shift = torch.zeros_like(rad_values)
418
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
419
+
420
+ # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
421
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
422
+ scale_factor=1/self.upsample_scale,
423
+ mode="linear").transpose(1, 2)
424
+
425
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
426
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
427
+ # cumsum_shift = torch.zeros_like(rad_values)
428
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
429
+
430
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
431
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
432
+ scale_factor=self.upsample_scale, mode="nearest").transpose(1, 2)
433
+ sines = torch.sin(phase)
434
+
435
+ else:
436
+ # If necessary, make sure that the first time step of every
437
+ # voiced segments is sin(pi) or cos(0)
438
+ # This is used for pulse-train generation
439
+
440
+ # identify the last time step in unvoiced segments
441
+ uv = self._f02uv(f0_values)
442
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
443
+ uv_1[:, -1, :] = 1
444
+ u_loc = (uv < 1) * (uv_1 > 0)
445
+
446
+ # get the instantanouse phase
447
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
448
+ # different batch needs to be processed differently
449
+ for idx in range(f0_values.shape[0]):
450
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
451
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
452
+ # stores the accumulation of i.phase within
453
+ # each voiced segments
454
+ tmp_cumsum[idx, :, :] = 0
455
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
456
+
457
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
458
+ # within the previous voiced segment.
459
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
460
+
461
+ # get the sines
462
+ sines = torch.cos(i_phase * 2 * np.pi)
463
+ return sines
464
+
465
+ def forward(self, f0):
466
+ """ sine_tensor, uv = forward(f0)
467
+ input F0: tensor(batchsize=1, length, dim=1)
468
+ f0 for unvoiced steps should be 0
469
+ output sine_tensor: tensor(batchsize=1, length, dim)
470
+ output uv: tensor(batchsize=1, length, 1)
471
+ """
472
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
473
+ device=f0.device)
474
+ # fundamental component
475
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
476
+
477
+ # generate sine waveforms
478
+ sine_waves = self._f02sine(fn) * self.sine_amp
479
+
480
+ # generate uv signal
481
+ # uv = torch.ones(f0.shape)
482
+ # uv = uv * (f0 > self.voiced_threshold)
483
+ uv = self._f02uv(f0)
484
+
485
+ # noise: for unvoiced should be similar to sine_amp
486
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
487
+ # . for voiced regions is self.noise_std
488
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
489
+ noise = noise_amp * self.sine_waves[:, :sine_waves.shape[1]].to(sine_waves.device)
490
+
491
+ # first: set the unvoiced part to 0 by uv
492
+ # then: additive noise
493
+ sine_waves = sine_waves * uv + noise
494
+ return sine_waves, uv, noise
495
+
496
+
497
+ class SourceModuleHnNSF(torch.nn.Module):
498
+ """ SourceModule for hn-nsf
499
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
500
+ add_noise_std=0.003, voiced_threshod=0)
501
+ sampling_rate: sampling_rate in Hz
502
+ harmonic_num: number of harmonic above F0 (default: 0)
503
+ sine_amp: amplitude of sine source signal (default: 0.1)
504
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
505
+ note that amplitude of noise in unvoiced is decided
506
+ by sine_amp
507
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
508
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
509
+ F0_sampled (batchsize, length, 1)
510
+ Sine_source (batchsize, length, 1)
511
+ noise_source (batchsize, length 1)
512
+ uv (batchsize, length, 1)
513
+ """
514
+
515
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
516
+ add_noise_std=0.003, voiced_threshod=0):
517
+ super(SourceModuleHnNSF, self).__init__()
518
+
519
+ self.sine_amp = sine_amp
520
+ self.noise_std = add_noise_std
521
+
522
+ # to produce sine waveforms
523
+ self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
524
+ sine_amp, add_noise_std, voiced_threshod)
525
+
526
+ # to merge source harmonics into a single excitation
527
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
528
+ self.l_tanh = torch.nn.Tanh()
529
+ self.uv = torch.rand(1, 300 * 24000, 1)
530
+
531
+ def forward(self, x):
532
+ """
533
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
534
+ F0_sampled (batchsize, length, 1)
535
+ Sine_source (batchsize, length, 1)
536
+ noise_source (batchsize, length 1)
537
+ """
538
+ # source for harmonic branch
539
+ with torch.no_grad():
540
+ sine_wavs, uv, _ = self.l_sin_gen(x)
541
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
542
+
543
+ # source for noise branch, in the same shape as uv
544
+ noise = self.uv[:, :uv.shape[1]] * self.sine_amp / 3
545
+ return sine_merge, noise, uv
546
+
547
+
548
+ class CausalHiFTGenerator(nn.Module):
549
+ """
550
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
551
+ https://arxiv.org/abs/2309.09493
552
+ """
553
+ def __init__(
554
+ self,
555
+ in_channels: int = 80,
556
+ base_channels: int = 512,
557
+ nb_harmonics: int = 8,
558
+ sampling_rate: int = 22050,
559
+ nsf_alpha: float = 0.1,
560
+ nsf_sigma: float = 0.003,
561
+ nsf_voiced_threshold: float = 10,
562
+ upsample_rates: List[int] = [8, 8],
563
+ upsample_kernel_sizes: List[int] = [16, 16],
564
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
565
+ resblock_kernel_sizes: List[int] = [3, 7, 11],
566
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
567
+ source_resblock_kernel_sizes: List[int] = [7, 11],
568
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
569
+ lrelu_slope: float = 0.1,
570
+ audio_limit: float = 0.99,
571
+ f0_predictor: torch.nn.Module = None,
572
+ ):
573
+ super(CausalHiFTGenerator, self).__init__()
574
+
575
+ self.out_channels = 1
576
+ self.nb_harmonics = nb_harmonics
577
+ self.sampling_rate = sampling_rate
578
+ self.istft_params = istft_params
579
+ self.lrelu_slope = lrelu_slope
580
+ self.audio_limit = audio_limit
581
+
582
+ self.num_kernels = len(resblock_kernel_sizes)
583
+ self.num_upsamples = len(upsample_rates)
584
+ self.m_source = SourceModuleHnNSF(
585
+ sampling_rate=sampling_rate,
586
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
587
+ harmonic_num=nb_harmonics,
588
+ sine_amp=nsf_alpha,
589
+ add_noise_std=nsf_sigma,
590
+ voiced_threshod=nsf_voiced_threshold)
591
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"], mode='nearest')
592
+
593
+ self.conv_pre = weight_norm(
594
+ LookRightConv1d(in_channels, base_channels, 5, 1)
595
+ )
596
+
597
+ # Up
598
+ self.ups = nn.ModuleList()
599
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
600
+ self.ups.append(
601
+ weight_norm(
602
+ LookLeftConvTranspose1d(
603
+ base_channels // (2**i),
604
+ base_channels // (2**(i + 1)),
605
+ k,
606
+ u
607
+ )
608
+ )
609
+ )
610
+
611
+ # Down
612
+ self.source_downs = nn.ModuleList()
613
+ self.source_resblocks = nn.ModuleList()
614
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
615
+ downsample_cum_rates = np.cumprod(downsample_rates)
616
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
617
+ if u == 1:
618
+ self.source_downs.append(
619
+ LookLeftConv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
620
+ )
621
+ else:
622
+ self.source_downs.append(
623
+ LookLeftConv1dWithStride(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u)
624
+ )
625
+
626
+ self.source_resblocks.append(
627
+ ResBlock(base_channels // (2 ** (i + 1)), k, d)
628
+ )
629
+
630
+ self.resblocks = nn.ModuleList()
631
+ for i in range(len(self.ups)):
632
+ ch = base_channels // (2**(i + 1))
633
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
634
+ self.resblocks.append(ResBlock(ch, k, d))
635
+
636
+ self.conv_post = weight_norm(LookLeftConv1d(ch, istft_params["n_fft"] + 2, 7, 1))
637
+ self.ups.apply(init_weights)
638
+ self.conv_post.apply(init_weights)
639
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
640
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
641
+ self.f0_predictor = f0_predictor
642
+ # f0回退3帧,hift回退5帧
643
+ self.context_size = 8
644
+
645
+ def remove_weight_norm(self):
646
+ print('Removing weight norm...')
647
+ for l in self.ups:
648
+ try:
649
+ remove_weight_norm(l)
650
+ except:
651
+ remove_parametrizations(l, 'weight')
652
+ for l in self.resblocks:
653
+ l.remove_weight_norm()
654
+ try:
655
+ remove_weight_norm(self.conv_pre)
656
+ remove_weight_norm(self.conv_post)
657
+ except:
658
+ remove_parametrizations(self.conv_pre, 'weight')
659
+ remove_parametrizations(self.conv_post, 'weight')
660
+ self.f0_predictor.remove_weight_norm()
661
+ for l in self.source_resblocks:
662
+ l.remove_weight_norm()
663
+
664
+ def _stft(self, x):
665
+ spec = torch.stft(
666
+ x,
667
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
668
+ return_complex=True)
669
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
670
+ return spec[..., 0], spec[..., 1]
671
+
672
+ def _istft(self, magnitude, phase):
673
+ magnitude = torch.clip(magnitude, max=1e2)
674
+ real = magnitude * torch.cos(phase)
675
+ img = magnitude * torch.sin(phase)
676
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
677
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
678
+ return inverse_transform
679
+
680
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(0, 0, 0), finalize: bool = True) -> torch.Tensor:
681
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
682
+ # NOTE(lyuxiang.lx) 回退4帧
683
+ if finalize is False:
684
+ s_stft_real, s_stft_imag = s_stft_real[:, :, :-int(480 * 4 / self.istft_params["hop_len"])], s_stft_imag[:, :, :-int(480 * 4 / self.istft_params["hop_len"])]
685
+ x = self.conv_pre(x[:, :, :-4], x[:, :, -4:])
686
+ else:
687
+ x = self.conv_pre(x)
688
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
689
+ for i in range(self.num_upsamples):
690
+ x = F.leaky_relu(x, self.lrelu_slope)
691
+ x, _ = self.ups[i](x)
692
+
693
+ if i == self.num_upsamples - 1:
694
+ x = self.reflection_pad(x)
695
+
696
+ # fusion
697
+ si, _ = self.source_downs[i](s_stft)
698
+ si, _ = self.source_resblocks[i](si)
699
+ x = x + si
700
+
701
+ xs = None
702
+ for j in range(self.num_kernels):
703
+ this_xs, _ = self.resblocks[i * self.num_kernels + j](x)
704
+ if xs is None:
705
+ xs = this_xs
706
+ else:
707
+ xs += this_xs
708
+ x = xs / self.num_kernels
709
+
710
+ x = F.leaky_relu(x)
711
+ x, _ = self.conv_post(x)
712
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
713
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy
714
+
715
+ x = self._istft(magnitude, phase)
716
+ # NOTE(lyuxiang.lx) 回退1帧
717
+ if finalize is False:
718
+ x = x[:, :-480]
719
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
720
+ return x
721
+
722
+ @torch.inference_mode()
723
+ def inference(self, speech_feat: torch.Tensor, f0_cpu: bool = False, finalize: bool = True) -> torch.Tensor:
724
+ # mel->f0->source
725
+ if f0_cpu is True:
726
+ self.f0_predictor.to('cpu')
727
+ f0, _ = self.f0_predictor(speech_feat.cpu(), finalize=finalize)
728
+ f0 = f0.to(speech_feat.device)
729
+ else:
730
+ self.f0_predictor.to(speech_feat.device)
731
+ f0, _ = self.f0_predictor(speech_feat, finalize=finalize)
732
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
733
+ s, _, _ = self.m_source(s)
734
+ s = s.transpose(1, 2)
735
+ if finalize is False:
736
+ generated_speech = self.decode(speech_feat[:, :, :-3], s, finalize=finalize)
737
+ else:
738
+ generated_speech = self.decode(speech_feat, s, finalize=finalize)
739
+ return generated_speech, []
740
+
741
+
742
+ class CausalHifiGan(nn.Module):
743
+ """HIFIGAN-style vocoders (generator [stack of time-level-upsampling blocks] + discriminator).
744
+ NSF-HIFIGAN, HiFTNet Optional.
745
+ """
746
+
747
+ def __init__(
748
+ self,
749
+ CausalHiFTGenerator_conf: dict = {},
750
+ CausalConvRNNF0Predictor_conf: dict = {},
751
+ sample_rate: float = 24000,
752
+ **kwargs
753
+ ):
754
+ super().__init__()
755
+ self.generator = CausalHiFTGenerator(**CausalHiFTGenerator_conf)
756
+ self.generator.f0_predictor = CausalConvRNNF0Predictor(**CausalConvRNNF0Predictor_conf)
757
+ self.generator.remove_weight_norm()
758
+ self.sample_rate = sample_rate
759
+
760
+ def inference_prepare(
761
+ self,
762
+ data_in,
763
+ data_lengths=None,
764
+ key: list = None,
765
+ **kwargs,
766
+ ):
767
+ if kwargs.get("batch_size", 1) > 1:
768
+ raise NotImplementedError("batch decoding is not implemented")
769
+
770
+ feat_list = []
771
+ feat_len_list = []
772
+ for i, feat in enumerate(data_in):
773
+ if isinstance(feat, str) and os.path.exists(feat):
774
+ feat = np.load(feat)
775
+ if isinstance(feat, np.ndarray):
776
+ feat = torch.from_numpy(feat)
777
+
778
+ feat_list.append(feat)
779
+ feat_len_list.append(feat.shape[0])
780
+
781
+ batch = {
782
+ "x": pad_sequence(feat_list, batch_first=True),
783
+ "x_lengths": torch.tensor(feat_len_list, dtype=torch.int64),
784
+ }
785
+ batch = to_device(batch, kwargs["device"])
786
+
787
+ return batch
788
+
789
+ def inference(
790
+ self,
791
+ data_in,
792
+ data_lengths=None,
793
+ key: list = None,
794
+ f0_cpu: bool = True,
795
+ finalize: bool = True,
796
+ **kwargs,
797
+ ) -> torch.Tensor:
798
+ """Run inference.
799
+
800
+ Args:
801
+ x (torch.Tensor): input representation, B x T x C
802
+
803
+ Returns:
804
+ Dict[str, Tensor]:
805
+ * recon_speech (Tensor): Reconstructed waveform tensor (B, T_wav).
806
+
807
+ """
808
+ uttid = key[0]
809
+ batch = self.inference_prepare(data_in, data_lengths, key, **kwargs)
810
+ voc_dtype = dtype_map[kwargs.get("voc_dtype", "fp32")]
811
+ x = batch["x"].to(voc_dtype)
812
+ recon_speech = self.generator.inference(x.transpose(1, 2), f0_cpu=f0_cpu, finalize=finalize)[0].squeeze(1)
813
+ recon_speech = recon_speech.float()
814
+ logging.info(f"{uttid}: wav lengths {recon_speech.shape[1]}")
815
+
816
+ output_dir = kwargs.get("output_dir", None)
817
+ output_sr = kwargs.get("output_sr", None)
818
+ if output_dir is not None:
819
+ wav_out_dir = os.path.join(output_dir, "wav")
820
+ os.makedirs(wav_out_dir, exist_ok=True)
821
+ wav_sr = self.sample_rate
822
+ if output_sr is not None and output_sr != self.sample_rate:
823
+ recon_speech = torchaudio.functional.resample(
824
+ recon_speech,
825
+ orig_freq=self.sample_rate,
826
+ new_freq=output_sr
827
+ )
828
+ wav_sr = output_sr
829
+ torchaudio.save(
830
+ os.path.join(wav_out_dir, f"{key[0]}.wav"), recon_speech.cpu(),
831
+ sample_rate=wav_sr, encoding='PCM_S', bits_per_sample=16
832
+ )
833
+
834
+ return recon_speech
funcineforge/models/flow_matching_model.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from typing import Dict
6
+ import logging
7
+ from librosa.filters import mel as librosa_mel_fn
8
+ import torch.nn.functional as F
9
+ from funcineforge.models.utils.nets_utils import make_pad_mask
10
+ from funcineforge.utils.device_funcs import to_device
11
+ import numpy as np
12
+ from funcineforge.utils.load_utils import extract_campp_xvec
13
+ import time
14
+ from funcineforge.models.utils import dtype_map
15
+ from funcineforge.utils.hinter import hint_once
16
+ from funcineforge.models.utils.masks import add_optional_chunk_mask
17
+ from .modules.dit_flow_matching.dit_model import DiT
18
+
19
+
20
class Audio2Mel(nn.Module):
    """Waveform -> log-mel-spectrogram front end.

    Computes an STFT (``center=False`` with manual reflect padding so the
    frame count stays hop-aligned) and projects magnitudes onto a mel
    filterbank.  ``feat_type`` selects between log10-magnitude mels
    ("mag_log10") and natural-log power mels (default "power_log").
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        sampling_rate=22050,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=None,
        center=False,
        device='cuda',
        feat_type="power_log",
    ):
        super().__init__()
        ##############################################
        # FFT Parameters
        ##############################################
        window = torch.hann_window(win_length, device=device).float()
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
        )
        mel_basis = torch.from_numpy(mel_basis).float().to(device)
        # Buffers follow the module across devices but take no gradients.
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.mel_fmax = mel_fmax
        self.center = center
        self.feat_type = feat_type

    def forward(self, audioin):
        """audioin: (B, 1, T) waveform -> (B, n_mels, frames) mel features."""
        # Pad so frame count matches hop-aligned length with center=False.
        p = (self.n_fft - self.hop_length) // 2
        audio = F.pad(audioin, (p, p), "reflect").squeeze(1)
        fft = torch.stft(
            audio,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=self.center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        if self.feat_type == "mag_log10":
            power_spec = torch.sqrt(torch.pow(fft.imag, 2) + torch.pow(fft.real, 2))
            mel_output = torch.matmul(self.mel_basis, power_spec)
            return torch.log10(torch.clamp(mel_output, min=1e-5))
        power_spec = torch.pow(fft.imag, 2) + torch.pow(fft.real, 2)
        # 1e-9 keeps sqrt finite/differentiable on silent frames.
        mel_spec = torch.matmul(self.mel_basis, torch.sqrt(power_spec + 1e-9))
        return self.spectral_normalize(mel_spec)

    @classmethod
    def spectral_normalize(cls, spec, C=1, clip_val=1e-5):
        """Compress mel magnitudes: log(clamp(spec, clip_val) * C)."""
        output = cls.dynamic_range_compression(spec, C, clip_val)
        return output

    @classmethod
    def spectral_de_normalize_torch(cls, spec, C=1, clip_val=1e-5):
        """Invert :meth:`spectral_normalize` (clip_val has no inverse effect)."""
        # BUGFIX: ``dynamic_range_decompression`` previously took only
        # (x, C), so forwarding ``clip_val`` raised TypeError; the inverse
        # now tolerates (and ignores) it.
        output = cls.dynamic_range_decompression(spec, C, clip_val)
        return output

    @staticmethod
    def dynamic_range_compression(x, C=1, clip_val=1e-5):
        """log-compress ``x`` after clamping to ``clip_val``."""
        return torch.log(torch.clamp(x, min=clip_val) * C)

    @staticmethod
    def dynamic_range_decompression(x, C=1, clip_val=1e-5):
        """Inverse of compression; ``clip_val`` is accepted only for
        signature symmetry with the forward transform and is unused."""
        return torch.exp(x) / C
94
+
95
+
96
class LookaheadBlock(nn.Module):
    """Conv block with a small fixed right-context (look-ahead).

    ``conv1`` consumes ``pre_lookahead_len`` future frames (real streaming
    context from the caller, or zero padding when none is given); ``conv2``
    is left-padded so the block adds no further look-ahead.  A residual
    connection and a length mask are applied at the end.
    """

    def __init__(self, in_channels: int, channels: int, pre_lookahead_len: int = 1):
        super().__init__()
        self.channels = channels
        self.pre_lookahead_len = pre_lookahead_len
        # kernel = lookahead + 1: each output frame sees itself plus
        # ``pre_lookahead_len`` future frames.
        self.conv1 = nn.Conv1d(
            in_channels, channels,
            kernel_size=pre_lookahead_len+1,
            stride=1, padding=0,
        )
        self.conv2 = nn.Conv1d(
            channels, in_channels,
            kernel_size=3, stride=1, padding=0,
        )

    def forward(self, inputs, ilens, context: torch.Tensor = torch.zeros(0, 0, 0)):
        """
        inputs: (batch_size, seq_len, channels)
        ilens: (batch_size,) valid lengths
        context: (batch_size, pre_lookahead_len, channels) future frames for
            streaming; pass an empty tensor to zero-pad instead.
        NOTE(review): the default ``context`` tensor is built once at import
        time; it is never mutated here, but a None default would be safer.
        """
        outputs = inputs.transpose(1, 2).contiguous()
        context = context.transpose(1, 2).contiguous()
        # look ahead
        if context.size(2) == 0:
            outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0)
        else:
            assert context.size(2) == self.pre_lookahead_len
            outputs = torch.concat([outputs, context], dim=2)
        outputs = F.leaky_relu(self.conv1(outputs))
        # Left-pad by kernel_size-1 so conv2 stays causal.
        outputs = F.pad(outputs, (2, 0), mode='constant', value=0)
        outputs = self.conv2(outputs)
        outputs = outputs.transpose(1, 2).contiguous()

        mask = (~make_pad_mask(ilens).unsqueeze(-1).to(inputs.device))
        # residual connection
        outputs = (outputs + inputs) * mask

        return outputs, ilens
134
+
135
+
136
class CosyVoiceFlowMatching(nn.Module):
    """Conditional flow-matching model mapping speech tokens to mel features."""

    def __init__(
        self,
        codebook_size: int,
        model_size: int,
        xvec_size: int = 198,
        dit_conf: Dict = {},
        mel_feat_conf: Dict = {},
        prompt_conf: Dict = None,
        **kwargs):
        """
        Args:
            codebook_size: number of valid speech-token ids.
            model_size: hidden size of the look-ahead conv block.
            xvec_size: speaker-embedding (x-vector) dimension; ``None``
                disables the speaker projection layer.
            dit_conf: config forwarded to the DiT backbone.
            mel_feat_conf: config for the Audio2Mel extractor; must provide
                "n_mel_channels".
            prompt_conf: optional prompt-masking config (training-time only).
        """
        super().__init__()

        # feat related
        self.feat_token_ratio = kwargs.get("feat_token_ratio", None)
        try:
            self.mel_extractor = Audio2Mel(**mel_feat_conf)
            self.sample_rate = self.mel_extractor.sampling_rate
        except Exception as e:
            # BUGFIX: was a bare ``except:`` that silently swallowed even
            # KeyboardInterrupt/SystemExit; narrow the catch and log why we
            # fall back to a plain 24 kHz setup without a mel extractor.
            logging.warning(f"Audio2Mel init failed ({e}); falling back to sample_rate=24000 without mel extractor")
            self.mel_extractor = None
            self.sample_rate = 24000
        self.mel_norm_type = kwargs.get("mel_norm_type", None)
        self.num_mels = num_mels = mel_feat_conf["n_mel_channels"]
        self.token_rate = kwargs.get("token_rate", 25)
        self.model_dtype = kwargs.get("model_dtype", "fp32")
        self.codebook_size = codebook_size

        # condition related
        self.prompt_conf = prompt_conf
        if self.prompt_conf is not None:
            self.prompt_masker = self.build_prompt_masker()

        # codec related
        self.codec_embedder = nn.Embedding(codebook_size, num_mels)
        lookahead_length = kwargs.get("lookahead_length", 4)
        self.lookahead_conv1d = LookaheadBlock(num_mels, model_size, lookahead_length)

        # spk embed related
        if xvec_size is not None:
            self.xvec_proj = torch.nn.Linear(xvec_size, num_mels)

        # dit model related
        self.dit_conf = dit_conf
        self.dit_model = DiT(**dit_conf)

        self.training_cfg_rate = kwargs.get("training_cfg_rate", 0)
        self.only_mask_loss = kwargs.get("only_mask_loss", True)

        # NOTE: flow matching needs this much right (future) context.
        self.context_size = self.lookahead_conv1d.pre_lookahead_len
185
+
186
    def build_prompt_masker(self):
        """Build the masker that selects prompt regions during training.

        Only the "prefix" prompt type is implemented: a variable-width tail
        of the target is masked so the unmasked head acts as the prompt.

        Raises:
            NotImplementedError: for any other ``prompt_type``.
        """
        prompt_type = self.prompt_conf.get("prompt_type", "free")
        if prompt_type == "prefix":
            from funcineforge.models.utils.mask_along_axis import MaskTailVariableMaxWidth
            masker = MaskTailVariableMaxWidth(
                mask_width_ratio_range=self.prompt_conf["prompt_width_ratio_range"],
            )
        else:
            raise NotImplementedError

        return masker
197
+
198
+ @staticmethod
199
+ def norm_spk_emb(xvec):
200
+ xvec_mask = (~xvec.norm(dim=-1).isnan()) * (~xvec.norm(dim=-1).isinf())
201
+ xvec = xvec * xvec_mask.unsqueeze(-1)
202
+ xvec = xvec.mean(dim=1)
203
+ xvec = F.normalize(xvec, dim=1)
204
+
205
+ return xvec
206
+
207
    def select_target_prompt(self, y: torch.Tensor, y_lengths: torch.Tensor):
        """Return a mask marking the prompt (kept) region of ``y``.

        The mask is 1 over the prompt prefix and 0 over the masked tail,
        i.e. ``1, 1, 1, ..., 0, 0, 0`` along time.
        """
        cond_mask = self.prompt_masker(y, y_lengths, return_mask=True)

        return cond_mask
212
+
213
    @torch.no_grad()
    def normalize_mel_feat(self, feat, feat_lengths):
        """Normalize padded mel features (B, T, D) per utterance.

        "mean_std": zero-mean/unit-std over valid frames, pooling all mel
        bins together.  "min_max": rescale valid values and multiply by 3.
        """
        # feat in B,T,D
        if self.mel_norm_type == "mean_std":
            max_length = feat.shape[1]
            mask = (~make_pad_mask(feat_lengths, maxlen=max_length))
            mask = mask.unsqueeze(-1).to(feat)
            mean = ((feat * mask).sum(dim=(1, 2), keepdim=True) /
                    (mask.sum(dim=(1, 2), keepdim=True) * feat.shape[-1]))
            var = (((feat - mean)**2 * mask).sum(dim=(1, 2), keepdim=True) /
                   (mask.sum(dim=(1, 2), keepdim=True) * feat.shape[-1] - 1))  # -1 for unbiased estimation
            std = torch.sqrt(var)
            feat = (feat - mean) / std
            feat = feat * mask
            return feat
        if self.mel_norm_type == "min_max":
            bb, tt, dd = feat.shape
            mask = (~make_pad_mask(feat_lengths, maxlen=tt))
            mask = mask.unsqueeze(-1).to(feat)
            # NOTE(review): min/max are taken over ``feat * mask``, so padded
            # zeros can dominate the min whenever all valid values are
            # positive -- confirm this is intended.
            feat_min = (feat * mask).reshape([bb, tt * dd]).min(dim=1, keepdim=True).values.unsqueeze(-1)
            feat_max = (feat * mask).reshape([bb, tt * dd]).max(dim=1, keepdim=True).values.unsqueeze(-1)
            feat = (feat - feat_min) / (feat_max - feat_min)
            # noise ~ N(0, I), P(x >= 3sigma) = 0.001, 3 is enough.
            feat = (feat * 3) * mask  # yields values in [0, 3] (not [-3, 3]) -- verify intent
            return feat
        else:
            raise NotImplementedError
240
+
241
    @torch.no_grad()
    def extract_feat(self, y: torch.Tensor, y_lengths: torch.Tensor):
        """Extract (and optionally normalize) mel features from waveforms.

        Returns (feat, feat_lengths) with ``feat`` in (B, T, D).
        """
        mel_extractor = self.mel_extractor.float()
        feat = mel_extractor(y)
        feat = feat.transpose(1, 2)
        # NOTE(review): true division followed by an integer-dtype cast
        # truncates toward zero; a floor-div (//) is probably intended --
        # confirm frame alignment with the extractor's padding.
        feat_lengths = (y_lengths / self.mel_extractor.hop_length).to(y_lengths)
        if self.mel_norm_type is not None:
            feat = self.normalize_mel_feat(feat, feat_lengths)
        return feat, feat_lengths
250
+
251
    def load_data(self, contents: dict, **kwargs):
        """Assemble one decoding sample for the flow-matching model.

        Gathers the codec tokens, an optional prompt codec, and a speaker
        embedding derived either from a prompt wav (via CAM++ x-vector
        extraction) or from a precomputed ``.npy`` file.

        Returns:
            dict with keys codec/codec_lengths, prompt_codec(+lengths),
            prompt_wav(+lengths) (currently always None), xvec(+lengths).
        """
        fm_use_prompt = kwargs.get("fm_use_prompt", True)

        # codec
        codec = contents["codec"]
        if isinstance(codec, np.ndarray):
            codec = torch.from_numpy(codec)
            # codec = torch.from_numpy(codec)[None, :]
        codec_lengths = torch.tensor([codec.shape[1]], dtype=torch.int64)

        # prompt codec (optional)
        prompt_codec = kwargs.get("prompt_codec", None)
        prompt_codec_lengths = None
        if prompt_codec is not None and fm_use_prompt:
            # Accept a path to a .npy, a numpy array, or a tensor.
            if isinstance(prompt_codec, str) and os.path.exists(prompt_codec):
                prompt_codec = np.load(prompt_codec)
            if isinstance(prompt_codec, np.ndarray):
                prompt_codec = torch.from_numpy(prompt_codec)[None, :]
            prompt_codec_lengths = torch.tensor([prompt_codec.shape[1]], dtype=torch.int64)
        else:
            prompt_codec = None
        spk_emb = kwargs.get("spk_emb", None)
        spk_emb_lengths = None
        if spk_emb is not None:
            if isinstance(spk_emb, str) and os.path.exists(spk_emb):
                spk_emb = np.load(spk_emb)
            if isinstance(spk_emb, np.ndarray):
                spk_emb = torch.from_numpy(spk_emb)[None, :]
            spk_emb_lengths = torch.tensor([spk_emb.shape[1]], dtype=torch.int64)

        # prompt wav as condition: overrides spk_emb when a valid path is
        # given (either a precomputed .npy x-vector or a wav to extract from).
        prompt_wav = contents["vocal"]
        prompt_wav_lengths = None
        if prompt_wav is not None and fm_use_prompt and os.path.exists(prompt_wav):
            if prompt_wav.endswith(".npy"):
                spk_emb = np.load(prompt_wav)
                spk_emb_lengths = torch.tensor([spk_emb.shape[1]], dtype=torch.int64)
            else:
                spk_emb = extract_campp_xvec(prompt_wav, **kwargs)
                spk_emb = torch.from_numpy(spk_emb)
                spk_emb_lengths = torch.tensor([spk_emb.shape[1]], dtype=torch.int64)
            # prompt_wav = load_audio_text_image_video(prompt_wav, fs=self.sample_rate)
            # prompt_wav = prompt_wav[None, :]
            # prompt_wav_lengths = torch.tensor([prompt_wav.shape[1]], dtype=torch.int64)
        else:
            logging.info("[error] prompt_wav is None or not path or path not exists! Please provide the correct speaker embedding.")

        output = {
            "codec": codec,
            "codec_lengths": codec_lengths,
            "prompt_codec": prompt_codec,
            "prompt_codec_lengths": prompt_codec_lengths,
            "prompt_wav": None,
            "prompt_wav_lengths": None,
            "xvec": spk_emb,
            "xvec_lengths": spk_emb_lengths,
        }

        return output
310
+
311
+ @torch.no_grad()
312
+ def inference(
313
+ self,
314
+ data_in,
315
+ data_lengths=None,
316
+ key: list = None,
317
+ chunk_size: int = -1,
318
+ finalize: bool = True,
319
+ **kwargs,
320
+ ):
321
+ uttid = key[0]
322
+ if kwargs.get("batch_size", 1) > 1:
323
+ raise NotImplementedError("batch decoding is not implemented")
324
+ batch = self.load_data(data_in[0], **kwargs)
325
+ batch = to_device(batch, kwargs["device"])
326
+ batch.update({'finalize': finalize, 'chunk_size': chunk_size})
327
+ feat = self._inference(**batch, **kwargs)
328
+ feat = feat.float()
329
+ logging.info(f"{uttid}: feat lengths {feat.shape[1]}")
330
+
331
+ return feat
332
+
333
    @torch.no_grad()
    def _inference(
        self,
        codec, codec_lengths,
        prompt_codec=None, prompt_codec_lengths=None,
        prompt_wav=None, prompt_wav_lengths=None,
        xvec=None, xvec_lengths=None, chunk_size=-1, finalize=False,
        **kwargs
    ):
        """Core token->mel flow-matching pass.

        Embeds the (prompt +) codec tokens, applies the look-ahead conv,
        upsamples to the mel frame rate, then solves the probability-flow
        ODE conditioned on the speaker embedding and optional prompt mels.
        """
        fm_dtype = dtype_map[kwargs.get("fm_dtype", "fp32")]
        rand_xvec = None
        if xvec is not None:
            if xvec.dim() == 2:
                xvec = xvec.unsqueeze(1)
                xvec_lens = torch.ones_like(xvec_lengths)  # NOTE(review): unused
            # Pool, normalize, and project the x-vector to mel dimensionality.
            rand_xvec = self.norm_spk_emb(xvec)
            self.xvec_proj.to(fm_dtype)
            rand_xvec = self.xvec_proj(rand_xvec.to(fm_dtype))
            rand_xvec = rand_xvec.unsqueeze(1)

        # Drop token ids outside the codebook (e.g. special/EOS ids).
        if (codec >= self.codebook_size).any():
            new_codec = codec[codec < self.codebook_size].unsqueeze(0)
            logging.info(f"remove out-of-range token for FM: from {codec.shape[1]} to {new_codec.shape[1]}.")
            codec_lengths = codec_lengths - (codec.shape[1] - new_codec.shape[1])
            codec = new_codec
        if prompt_codec is not None:
            codec, codec_lengths = self.concat_prompt(prompt_codec, prompt_codec_lengths, codec, codec_lengths)
        # -1 marks padding; clamp for the embedding lookup, then zero it out.
        mask = (codec != -1).float().unsqueeze(-1)
        codec_emb = self.codec_embedder(torch.clamp(codec, min=0)) * mask

        self.lookahead_conv1d.to(fm_dtype)
        if finalize is True:
            # Final chunk: no future context, let the conv zero-pad.
            context = torch.zeros(1, 0, self.codec_embedder.embedding_dim).to(fm_dtype)
        else:
            # Streaming: hold back the last ``context_size`` frames as the
            # right context for the look-ahead convolution.
            codec_emb, context = codec_emb[:, :-self.context_size].to(fm_dtype), codec_emb[:, -self.context_size:].to(fm_dtype)
            codec_lengths = codec_lengths - self.context_size
        mu, _ = self.lookahead_conv1d(codec_emb, codec_lengths, context)
        # Upsample token-rate features to the mel frame rate.
        mu = mu.repeat_interleave(self.feat_token_ratio, dim=1)
        conditions = torch.zeros([mu.size(0), mu.shape[1], self.num_mels]).to(mu)
        # get conditions
        if prompt_wav is not None:
            if prompt_wav.ndim == 2:
                prompt_wav, prompt_wav_lengths = self.extract_feat(prompt_wav, prompt_wav_lengths)
            # NOTE: for the fmax-12k FM variant, consider interpolating the
            # prompt mel to 2x the token length instead of hard truncation.
            prompt_wav = prompt_wav.to(fm_dtype)
            for i, _len in enumerate(prompt_wav_lengths):
                conditions[i, :_len] = prompt_wav[i]

        feat_lengths = codec_lengths * self.feat_token_ratio
        # NOTE: add_optional_chunk_mask supports chunk sizes of -1/1/15/30.
        mask = add_optional_chunk_mask(mu, torch.ones([1, 1, mu.shape[1]]).to(mu).bool(), False, False, 0, chunk_size, -1)
        feat = self.solve_ode(mu, rand_xvec, conditions.to(fm_dtype), mask, **kwargs)

        if prompt_codec is not None and prompt_wav is not None:
            # Strip the prompt region from the generated features.
            feat, feat_lens = self.remove_prompt(None, prompt_wav_lengths, feat, feat_lengths)

        return feat
391
+
392
+ @staticmethod
393
+ def concat_prompt(prompt, prompt_lengths, text, text_lengths):
394
+ xs_list, x_len_list = [], []
395
+ for idx, (_prompt_len, _text_len) in enumerate(zip(prompt_lengths, text_lengths)):
396
+ xs_list.append(torch.concat([prompt[idx, :_prompt_len], text[idx, :_text_len]], dim=0))
397
+ x_len_list.append(_prompt_len + _text_len)
398
+
399
+ xs = torch.nn.utils.rnn.pad_sequence(xs_list, batch_first=True, padding_value=0.0)
400
+ x_lens = torch.tensor(x_len_list, dtype=torch.int64).to(xs.device)
401
+
402
+ return xs, x_lens
403
+
404
+ @staticmethod
405
+ def remove_prompt(prompt, prompt_lengths, padded, padded_lengths):
406
+ xs_list = []
407
+ for idx, (_prompt_len, _x_len) in enumerate(zip(prompt_lengths, padded_lengths)):
408
+ xs_list.append(padded[idx, _prompt_len: _x_len])
409
+
410
+ xs = torch.nn.utils.rnn.pad_sequence(xs_list, batch_first=True, padding_value=0.0)
411
+
412
+ return xs, padded_lengths - prompt_lengths
413
+
414
    def get_rand_noise(self, mu: torch.Tensor, **kwargs):
        """Return the ODE's initial noise, shaped like ``mu`` (B, T, D).

        With ``use_fixed_noise_infer`` a single noise buffer is lazily
        allocated once and sliced per call, making decoding deterministic
        across chunks; otherwise fresh Gaussian noise is drawn each call.
        """
        use_fixed_noise_infer = kwargs.get("use_fixed_noise_infer", True)
        max_len = kwargs.get("max_len", 50*300)
        if use_fixed_noise_infer:
            # NOTE(review): the reuse check compares the feature dim
            # (shape[2]); if mu.shape[1] ever exceeds ``max_len`` the slice
            # below silently truncates -- confirm max_len always bounds the
            # sequence length.
            if not hasattr(self, "rand_noise") or self.rand_noise is None or self.rand_noise.shape[2] < mu.shape[2]:
                self.rand_noise = torch.randn([1, max_len, mu.shape[2]]).to(mu)
                logging.info("init random noise for Flow")
            # return self.rand_noise[:, :mu.shape[1], :]
            # Tile the same noise across the batch dimension.
            return torch.concat([self.rand_noise[:, :mu.shape[1], :] for _ in range(mu.size(0))], dim = 0)
        else:
            return torch.randn_like(mu)
425
+
426
+ def solve_ode(self, mu, rand_xvec, conditions, mask, **kwargs):
427
+ fm_dtype = dtype_map[kwargs.get("fm_dtype", "fp32")]
428
+ temperature = kwargs.get("temperature", 1.0)
429
+ n_timesteps = kwargs.get("n_timesteps", 10)
430
+ infer_t_scheduler = kwargs.get("infer_t_scheduler", "cosine")
431
+ z = self.get_rand_noise(mu) * temperature
432
+ # print("z", z.size(), "mu", mu.size())
433
+ t_span = torch.linspace(0, 1, n_timesteps + 1).to(mu)
434
+ # print("t_span", t_span)
435
+ if infer_t_scheduler == 'cosine':
436
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
437
+ fm_time = time.time()
438
+ self.dit_model.to(fm_dtype)
439
+ feat = self.solve_euler(
440
+ z.to(fm_dtype), t_span=t_span.to(fm_dtype), mu=mu.to(fm_dtype), mask=mask,
441
+ spks=rand_xvec.to(fm_dtype), cond=conditions.to(fm_dtype), **kwargs
442
+ )
443
+ escape_time = (time.time() - fm_time) * 1000.0
444
+ logging.info(f"fm dec {n_timesteps} step time: {escape_time:.2f}, avg {escape_time/n_timesteps:.2f} ms")
445
+ return feat
446
+
447
    def solve_euler(self, x, t_span, mu, mask, spks=None, cond=None, **kwargs):
        """
        Fixed-step Euler solver for the flow ODE with classifier-free guidance.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: prompt-mel conditioning tensor, or a dict with a "prompt"
                entry for the structured-prompt variant.
        """
        inference_cfg_rate = kwargs.get("inference_cfg_rate", 0.7)
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
        steps = 1
        z, bz = x, x.shape[0]
        while steps <= len(t_span) - 1:
            if inference_cfg_rate > 0:
                # Classifier-free guidance: evaluate conditional and
                # unconditional branches in one doubled batch; the second
                # half carries zeroed conditioning.
                x_in = torch.concat([x, x], dim=0)
                spks_in = torch.cat([spks, torch.zeros_like(spks)], dim=0)
                mask_in = torch.concat([mask, mask], dim=0)
                mu_in = torch.concat([mu, torch.zeros_like(mu)], dim=0)
                t_in = torch.concat([t.unsqueeze(0) for _ in range(mu_in.size(0))], dim=0)
                if isinstance(cond, torch.Tensor):
                    cond_in = torch.concat([cond, torch.zeros_like(cond)], dim=0)
                else:
                    # Dict prompt: zero the embedding half, keep its mask.
                    cond_in = dict(
                        prompt=[
                            torch.concat([cond["prompt"][0], torch.zeros_like(cond["prompt"][0])], dim=0),
                            torch.concat([cond["prompt"][1], cond["prompt"][1]], dim=0),
                        ]
                    )
            else:
                x_in, mask_in, mu_in, spks_in, t_in, cond_in = x, mask, mu, spks, t, cond

            # if spks is not None:
            #     cond_in = cond_in + spks

            infer_causal_mask_type = kwargs.get("infer_causal_mask_type", 0)
            chunk_mask_value = self.dit_model.causal_mask_type[infer_causal_mask_type]["prob_min"]
            hint_once(
                f"flow mask type: {infer_causal_mask_type}, mask_rank value: {chunk_mask_value}.",
                "chunk_mask_value"
            )
            dphi_dt = self.dit_model(
                x_in, cond_in, mu_in, spks_in, t_in,
                mask=mask_in,
                mask_rand=torch.ones_like(t_in).reshape(-1, 1, 1) * chunk_mask_value
            )
            if inference_cfg_rate > 0:
                # Guided velocity: (1 + w) * conditional - w * unconditional.
                dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [bz, bz], dim=0)
                dphi_dt = ((1.0 + inference_cfg_rate) * dphi_dt -
                           inference_cfg_rate * cfg_dphi_dt)

            x = x + dt * dphi_dt
            t = t + dt
            # sol.append(x)
            if steps < len(t_span) - 1:
                dt = t_span[steps + 1] - t
            steps += 1

        return x
funcineforge/models/inference_model.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import logging
4
+ import numpy as np
5
+ import os
6
+ import torchaudio
7
+ import time
8
+ import shutil
9
+ from funcineforge.utils.set_all_random_seed import set_all_random_seed
10
+ from moviepy.video.io.VideoFileClip import VideoFileClip, AudioFileClip
11
+
12
+
13
class FunCineForgeInferModel(nn.Module):
    """End-to-end inference pipeline: LM -> flow matching -> vocoder.

    Chains three pre-built model wrappers (language model producing speech
    tokens, flow-matching model producing mel features, vocoder producing
    the waveform) and optionally muxes the result onto a silent input video.
    """

    def __init__(
        self,
        lm_model,
        fm_model,
        voc_model,
        **kwargs
    ):
        # NOTE(review): AutoModel is imported but never used here -- likely
        # kept to avoid a circular import at module load time; confirm.
        from funcineforge.auto.auto_model import AutoModel
        super().__init__()
        self.tokenizer = lm_model.kwargs["tokenizer"]
        self.frontend = fm_model.kwargs["frontend"]
        self.lm_model = lm_model.model
        self.fm_model = fm_model.model
        self.voc_model = voc_model.model
        mel_extractor = self.fm_model.mel_extractor
        if mel_extractor:
            self.mel_frame_rate = mel_extractor.sampling_rate // mel_extractor.hop_length
            self.sample_rate = mel_extractor.sampling_rate
        else:
            # No mel extractor configured: assume a fixed hop of 480 samples.
            self.mel_frame_rate = self.fm_model.sample_rate // 480
            self.sample_rate = self.fm_model.sample_rate

    @torch.no_grad()
    def inference(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        **kwargs,
    ):
        """Generate audio (and optionally a muxed video) for one utterance.

        Returns ``([[wav]], {"batch_data_time": seconds})``; ``wav`` stays
        None when the LM produced no speech tokens.
        """
        uttid = key[0]
        logging.info(f"generating {uttid}")
        # text -> codec in [1, T]
        kwargs["tokenizer"] = self.tokenizer
        # The seed is reset before each stage so every stage is reproducible
        # regardless of how many random draws earlier stages consumed.
        set_all_random_seed(kwargs.get("random_seed", 0))
        lm_time = time.time()
        codec, hit_eos, states = self.lm_model.inference(data_in, data_lengths, key, **kwargs)
        logging.info(f"[llm time]: {((time.time()-lm_time)*1000):.2f} ms, [hit_eos]: {hit_eos}, [gen len]: {codec.shape[1]}, [speech tokens]: {codec[0].cpu().tolist()}")
        wav, batch_data_time = None, 1.0
        if codec.shape[1] > 0:
            fm_time = time.time()
            data_in[0]["codec"] = codec
            set_all_random_seed(kwargs.get("random_seed", 0))
            feat = self.fm_model.inference(data_in, data_lengths, key, **kwargs)
            # feat -> wav
            set_all_random_seed(kwargs.get("random_seed", 0))
            wav = self.voc_model.inference([feat[0]], data_lengths, key, **kwargs)
            # output save
            output_dir = kwargs.get("output_dir", None)
            if output_dir is not None:
                feat_out_dir = os.path.join(output_dir, "feat")
                os.makedirs(feat_out_dir, exist_ok=True)
                np.save(os.path.join(feat_out_dir, f"{key[0]}.npy"), feat[0].cpu().numpy())

                wav_out_dir = os.path.join(output_dir, "wav")
                os.makedirs(wav_out_dir, exist_ok=True)
                output_wav_path = os.path.join(wav_out_dir, f"{key[0]}.wav")
                torchaudio.save(
                    output_wav_path, wav.cpu(),
                    sample_rate=self.sample_rate, encoding='PCM_S', bits_per_sample=16
                )

                silent_video_path = data_in[0]["video"]
                if os.path.exists(silent_video_path):
                    video_out_dir = os.path.join(output_dir, "mp4")
                    video_gt_dir = os.path.join(output_dir, "gt")
                    os.makedirs(video_out_dir, exist_ok=True)
                    os.makedirs(video_gt_dir, exist_ok=True)
                    output_video_path = os.path.join(video_out_dir, f"{key[0]}.mp4")
                    copy_video_path = os.path.join(video_gt_dir, f"{key[0]}.mp4")
                    # Keep a copy of the original silent video for reference.
                    shutil.copy2(silent_video_path, copy_video_path)
                    self.merge_video_audio(
                        silent_video_path=silent_video_path,
                        wav_path=output_wav_path,
                        output_path=output_video_path,
                    )

            logging.info(f"fm_voc time: {((time.time()-fm_time)*1000):.2f} ms")

            batch_data_time = wav.shape[1] / self.voc_model.sample_rate

        return [[wav]], {"batch_data_time": batch_data_time}

    def merge_video_audio(self, silent_video_path, wav_path, output_path):
        """Mux ``wav_path`` onto the silent video, trimming audio to fit."""
        video_clip = VideoFileClip(silent_video_path)
        video_duration = video_clip.duration
        audio_clip = AudioFileClip(wav_path)
        audio_duration = audio_clip.duration

        if audio_duration >= video_duration:
            audio_clip = audio_clip.subclipped(0, video_duration)

        video_clip = video_clip.with_audio(audio_clip)
        video_clip.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac',
            fps=video_clip.fps,
            logger=None
        )
        video_clip.close()
        audio_clip.close()
funcineforge/models/language_model.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import torch
4
+ import torch.nn as nn
5
+ from funcineforge.models.utils.llm_decoding import LLMDecoder
6
+ from funcineforge.utils.device_funcs import to_device
7
+ import numpy as np
8
+ from funcineforge.models.utils import dtype_map
9
+ from funcineforge.models import FunCineForgeSpecAug
10
+ from transformers import AutoModelForCausalLM
11
+ import pickle
12
+
13
+
14
+
15
+ class FunCineForgeLM(nn.Module):
16
    def __init__(
        self,
        llm: str = None,
        llm_conf: dict = None,
        input_size: int = 80,
        length_normalized_loss: bool = False,
        **kwargs,
    ):
        """Speech-token language model wrapping a pretrained causal LLM.

        Loads a HuggingFace causal LM (optionally frozen), then attaches
        codec / time-speaker embedding tables, a codec output head, and a
        face-embedding projection sized to the LLM hidden dimension.
        """
        super().__init__()

        # llm
        self.llm_conf = llm_conf
        self.llm = None

        init_param_path = llm_conf.get("init_param_path", "")
        llm_load_kwargs = llm_conf.get("load_kwargs", {})
        self.sample_rate = kwargs.get("sample_rate", 24000)
        self.token_rate = kwargs.get("token_rate", 25)

        # If LoRA weights were already merged into the checkpoint, disable
        # every LoRA path for inference.
        if kwargs.get("infer_lora_merged", False):
            llm_conf["use_qlora"] = False
            llm_conf["use_lora"] = False
            kwargs["infer_use_lora"] = False


        model = AutoModelForCausalLM.from_pretrained(
            init_param_path,
            load_in_8bit=None,
            device_map=None,
            use_cache=None,
            **llm_load_kwargs,
        )

        freeze = llm_conf.get("freeze", True)
        if freeze:
            for name, param in model.named_parameters():
                param.requires_grad = False
            model.eval()

        logging.info(f"use_lora: {llm_conf.get('use_lora', False)}, use_qlora: {llm_conf.get('use_qlora', False)}, infer_use_lora: {kwargs.get('infer_use_lora',False)}, infer_lora_merged: {kwargs.get('infer_lora_merged',False)}")

        if llm_conf.get("activation_checkpoint", False):
            model.gradient_checkpointing_enable()

        self.llm_dtype = llm_conf.get("llm_dtype", "fp32")
        self.llm = model.to(dtype_map[self.llm_dtype])
        llm_dim = model.get_input_embeddings().weight.shape[-1]

        # Without LoRA the text LM head is unused (codec ids are decoded
        # through ``codec_head`` instead), so drop it to save memory.
        if (not llm_conf.get("use_lora", False)) and (not kwargs.get("infer_use_lora",False)):
            del self.llm.lm_head
        self.codec_unit = kwargs.get("codec_unit", 6761)
        self.timespk_unit = kwargs.get("timespk_unit", 1550)
        # padding_idx=0 for both embedding tables.
        self.codec_embed = nn.Embedding(self.codec_unit, llm_dim, 0)
        self.timespk_embed = nn.Embedding(self.timespk_unit, llm_dim, 0)
        self.codec_head = nn.Linear(llm_dim, self.codec_unit, bias=False)
        self.face_size = kwargs.get("face_size", 512)
        self.face_linear = nn.Linear(self.face_size, llm_dim)

        self.length_normalized_loss = length_normalized_loss
        self.ignore_id = kwargs.get("ignore_id", -100)

        specaug = kwargs.get("specaug", None)
        specaug_conf = kwargs.get("specaug_conf", {})
        if specaug is not None:
            specaug = FunCineForgeSpecAug(**specaug_conf)
        self.specaug = specaug
        rank = int(os.environ.get("RANK", 0))
        logging.info(f"rank: {rank}, model is builded.")
84
+
85
+
86
    def insert_face_embeddings(
        self, inputs_embeds, face_emb, attention_mask, labels_ids,
        codec_len, insert_pos, device
    ):
        """Insert face embeddings into ``inputs_embeds`` at a fixed position,
        updating ``attention_mask`` and ``labels_ids`` in step.

        Args:
            inputs_embeds: (batch_size, token_num, dims) input embeddings.
            face_emb: (batch_size, max_face_len, dims) face embeddings.
            attention_mask: (batch_size, token_num) attention mask.
            labels_ids: (batch_size, token_num) label ids.
            codec_len: (batch_size,) actual face-embedding length per sample.
            insert_pos: int, insertion position (right after the SOS token).
            device: target device for the newly allocated tensors.
        Returns:
            padded_inputs_embeds: inputs_embeds with face_emb inserted, padded.
            padded_attention_mask: updated attention mask.
            padded_labels: updated labels (face positions set to ignore_id).
        """
        batch_size, token_num, dims = inputs_embeds.shape
        max_face_len = face_emb.size(1)

        # Pre-compute the padded length of the merged sequence.
        new_max_length = token_num + max_face_len

        # Pre-allocate outputs to avoid per-sample concatenation.
        padded_inputs_embeds = torch.zeros(batch_size, new_max_length, dims, device=device)
        padded_attention_mask = torch.zeros(batch_size, new_max_length, device=device, dtype=attention_mask.dtype)
        padded_labels = torch.full((batch_size, new_max_length), self.ignore_id, device=device, dtype=labels_ids.dtype)

        for i in range(batch_size):
            current_face_len = codec_len[i].item()

            # Fill head / face segment / tail directly into the buffer.
            padded_inputs_embeds[i, :insert_pos] = inputs_embeds[i, :insert_pos]
            padded_inputs_embeds[i, insert_pos:insert_pos+current_face_len] = face_emb[i, :current_face_len]
            padded_inputs_embeds[i, insert_pos+current_face_len:token_num+current_face_len] = inputs_embeds[i, insert_pos:]

            # Same layout for the mask and the labels.
            padded_attention_mask[i, :insert_pos] = attention_mask[i, :insert_pos]
            padded_attention_mask[i, insert_pos:insert_pos+current_face_len] = 1
            padded_attention_mask[i, insert_pos+current_face_len:token_num+current_face_len] = attention_mask[i, insert_pos:]

            padded_labels[i, :insert_pos] = labels_ids[i, :insert_pos]
            padded_labels[i, insert_pos:insert_pos+current_face_len] = self.ignore_id
            padded_labels[i, insert_pos+current_face_len:token_num+current_face_len] = labels_ids[i, insert_pos:]

        return padded_inputs_embeds, padded_attention_mask, padded_labels
134
+
135
+
136
    def load_data(self, contents: dict, **kwargs):
        """Build the LM input sequence and role flags for one sample.

        Sequence layout: ``[sos, text_ids..., type_id, timespk_ids...,
        turn_of_speech]``.  Boolean flags mark which positions are text,
        time/speaker (including the ``type_id`` slot), or codec.  Face
        embeddings are loaded from a pickle and each detected face frame is
        held for up to 5 speech frames.
        """
        lm_use_prompt = kwargs.get("lm_use_prompt", True)
        tokenizer = kwargs.get("tokenizer")
        # text + clue
        text = contents["text"]
        clue = "<|startofclue|>" + contents["clue"] + "<|endofclue|>"
        if lm_use_prompt:
            text = clue + text
        text_ids = tokenizer.encode(text)
        text_len = len(text_ids)
        # timespk_ids
        timespk_ids = contents["timespk_ids"].tolist()
        type_id = contents["type_id"]
        # sequence
        sequence = [
            kwargs['dataset_conf']["sos"],
            *text_ids,
            type_id,
            *timespk_ids,
            kwargs['dataset_conf']["turn_of_speech"]
        ]
        input_ids = torch.tensor(sequence, dtype=torch.int64)

        # flag tensors
        text_flag = torch.zeros(len(sequence), dtype=torch.float32)
        timespk_flag = torch.zeros(len(sequence), dtype=torch.float32)
        codec_flag = torch.zeros(len(sequence), dtype=torch.float32)
        text_flag[1: text_len+1] = 1
        # Starts at text_len+1, so the type_id slot is counted as timespk.
        timespk_flag[text_len+1: -1] = 1
        codec_flag = 1 - text_flag - timespk_flag

        # face embs
        speech_len = contents["speech_len"]
        face_embs = torch.zeros((speech_len, self.face_size), dtype=torch.float32)
        face_path = contents.get("face")
        # NOTE(review): raises TypeError if "face" is missing -- confirm the
        # dataset always supplies it.
        with open(face_path, 'rb') as f:
            stat_obj = pickle.load(f)
        embeddings = stat_obj['embeddings']
        faceI = stat_obj['faceI']
        for emb, frameI in zip(embeddings, faceI):
            fi = int(frameI)
            if 0 <= fi < speech_len:
                # Hold each face embedding for up to 5 frames.
                end = min(fi + 5, speech_len)
                face_embs[fi:end] = torch.from_numpy(emb).expand(end - fi, -1)

        # batch dimension
        input_ids = input_ids[None, :]
        text_flag = text_flag[None, :]
        timespk_flag = timespk_flag[None, :]
        codec_flag = codec_flag[None, :]
        face_embs = face_embs[None, :, :]
        output = {
            "input_ids": input_ids,
            "face_embs": face_embs,
            "text_flag": text_flag > 0,
            "timespk_flag": timespk_flag > 0,
            "codec_flag": codec_flag > 0,
            "prompt_codec": None,  # you can add prompt codec here if needed
        }
        return output
196
+
197
def inference_prepare(self, data_in, **kwargs):
    """Assemble the LLM input embedding sequence for single-sample decoding.

    Embeds text/timespk/codec positions with their respective tables (selected
    by the boolean flags), projects face embeddings, and splices the face
    embeddings between the sos token and the rest of the sequence.

    Raises:
        NotImplementedError: if kwargs["batch_size"] > 1.
    """
    if kwargs.get("batch_size", 1) > 1:
        raise NotImplementedError("batch decoding is not implemented")
    output = self.load_data(data_in[0], **kwargs)
    batch = to_device(output, kwargs["device"])
    input_ids = batch["input_ids"]
    # zero-out any negative padding ids before embedding lookup
    input_ids = input_ids * (input_ids > 0)
    text_flag = batch["text_flag"]
    timespk_flag = batch["timespk_flag"]
    codec_flag = batch["codec_flag"]
    face_embs = batch["face_embs"]

    # LoRA-wrapped models nest the base model differently, hence the two paths
    if (kwargs.get("use_qlora",False) or kwargs.get("infer_use_lora",False)) and (not kwargs.get("infer_lora_merged",False)):
        text_embeds = self.llm.base_model.model.model.get_input_embeddings()(input_ids * text_flag) * text_flag.unsqueeze(-1)
    else:
        text_embeds = self.llm.model.get_input_embeddings()(input_ids * text_flag) * text_flag.unsqueeze(-1)
    timespk_embeds = self.timespk_embed(input_ids * timespk_flag) * timespk_flag.unsqueeze(-1)
    codec_embs = self.codec_embed(input_ids * codec_flag) * codec_flag.unsqueeze(-1)
    face_embs = self.face_linear(face_embs)

    # flags are disjoint, so summation selects exactly one embedding per position
    inputs_embeds = text_embeds + timespk_embeds + codec_embs

    inputs_embeds = torch.cat([
        inputs_embeds[:, 0:1, :],  # sos token
        face_embs,  # face embeddings
        inputs_embeds[:, 1:, :]  # inputs_embeds after sos
    ], dim=1)

    # optionally append a codec prompt (voice-cloning style conditioning)
    prompt_codec = batch.get("prompt_codec", None)
    if prompt_codec is not None:
        codec_emb = self.codec_embed(prompt_codec)
        inputs_embeds = torch.cat((inputs_embeds, codec_emb), dim=1)

    return inputs_embeds
231
+
232
@torch.no_grad()
def inference(
    self,
    data_in,
    data_lengths=None,
    key: list = None,
    **kwargs,
):
    """Autoregressively generate codec tokens for one utterance.

    Args:
        data_in: list with a single sample dict (see load_data).
        data_lengths: unused; kept for the common inference interface.
        key: list of utterance ids; key[0] names the saved .npy file.
        **kwargs: decoding options — "min_length"/"max_length", "llm_dtype",
            optional "output_dir", LoRA switches, and generator "states".

    Returns:
        (gen_codec, hit_eos, states) from the LLM decoder.
    """
    uttid = key[0]
    inputs_emb = self.inference_prepare(data_in, **kwargs)

    logging.info(f"{uttid}: min length: {kwargs['min_length']}, max length: {kwargs['max_length']}")

    dtype = dtype_map[kwargs.get("llm_dtype", "fp32")]
    # lazily build (and cache on self) the decoder wrapper on first call
    if not hasattr(self, "llm_generator"):
        llm_generator_conf = kwargs.get("dataset_conf", {})
        self.llm_generator = LLMDecoder(
            token_embeder=self.codec_embed,
            **llm_generator_conf
        ).to(dtype)

    # swap in the codec output head (path depends on LoRA wrapping)
    if (kwargs.get("use_qlora",False) or kwargs.get("infer_use_lora",False)) and (not kwargs.get("infer_lora_merged",False)):
        self.llm.base_model.model.lm_head = self.codec_head.to(dtype)
    else:
        self.llm.lm_head = self.codec_head.to(dtype)

    gen_codec, hit_eos, states = self.llm_generator(
        inputs_emb.to(dtype),
        self.llm,
        states=kwargs.get("states", {}),
        **kwargs
    )

    # optionally persist the generated codec tokens as <output_dir>/codec/<uttid>.npy
    output_dir = kwargs.get("output_dir", None)
    if output_dir is not None:
        output_dir = os.path.join(output_dir, "codec")
        os.makedirs(output_dir, exist_ok=True)
        np.save(
            os.path.join(output_dir, f"{key[0]}.npy"),
            gen_codec[0].cpu().numpy()
        )

    return gen_codec, hit_eos, states
funcineforge/models/modules/__init__.py ADDED
File without changes
funcineforge/models/modules/dit_flow_matching/__init__.py ADDED
File without changes
funcineforge/models/modules/dit_flow_matching/dit_model.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+ import torch.nn.functional as F
15
+ from einops import repeat
16
+ from x_transformers.x_transformers import RotaryEmbedding
17
+ from funcineforge.models.utils.masks import causal_block_mask
18
+
19
+ from .dit_modules import (
20
+ TimestepEmbedding,
21
+ ConvNeXtV2Block,
22
+ CausalConvPositionEmbedding,
23
+ DiTBlock,
24
+ AdaLayerNormZero_Final,
25
+ precompute_freqs_cis,
26
+ get_pos_embed_indices,
27
+ )
28
+
29
+
30
+ # Text embedding
31
+
32
+
33
class TextEmbedding(nn.Module):
    """Embed text tokens, optionally refined with sinusoidal positions and
    ConvNeXt-V2 blocks (enabled when conv_layers > 0)."""

    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        """Return text embeddings padded/cropped to seq_len.

        drop_text zeroes the tokens (classifier-free-guidance text dropout).
        """
        batch, text_len = text.shape[0], text.shape[1]
        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        text = F.pad(text, (0, seq_len - text_len), value=0)

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        text = self.text_embed(text)  # b n -> b n d

        # possible extra modeling
        if self.extra_modeling:
            # sinus pos emb
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

            # convnextv2 blocks
            text = self.text_blocks(text)

        return text
71
+
72
+
73
+ # noised input audio and context mixing embedding
74
+
75
+
76
class InputEmbedding(nn.Module):
    """Project the concatenation [noised mel, cond mel, text embed, (speaker)]
    to the model width and add a causal convolutional position embedding."""

    def __init__(self, mel_dim, text_dim, out_dim, spk_dim=None):
        super().__init__()
        spk_dim = 0 if spk_dim is None else spk_dim
        self.spk_dim = spk_dim
        # input width: noised mel + masked cond mel (mel_dim * 2) + text + optional speaker
        self.proj = nn.Linear(mel_dim * 2 + text_dim + spk_dim, out_dim)
        self.conv_pos_embed = CausalConvPositionEmbedding(dim=out_dim)

    def forward(
        self,
        x: float["b n d"],
        cond: float["b n d"],
        text_embed: float["b n d"],
        spks: float["b d"],
    ):
        to_cat = [x, cond, text_embed]
        if self.spk_dim > 0:
            # broadcast the per-utterance speaker vector over the time axis
            spks = repeat(spks, "b c -> b t c", t=x.shape[1])
            to_cat.append(spks)

        x = self.proj(torch.cat(to_cat, dim=-1))
        # residual add of the conv position embedding
        x = self.conv_pos_embed(x) + x
        return x
99
+
100
+
101
+ # Transformer backbone using DiT blocks
102
+
103
+
104
class DiT(nn.Module):
    """DiT backbone for flow matching: time-conditioned transformer blocks with
    rotary positions, optional long skip connection, and an optional mixture of
    block-causal attention masks."""

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=80,
        mu_dim=None,
        long_skip_connection=False,
        spk_dim=None,
        **kwargs
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        if mu_dim is None:
            mu_dim = mel_dim
        self.input_embed = InputEmbedding(mel_dim, mu_dim, dim, spk_dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
        )
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)
        # list of {"prob_min", "prob_max", "block_size"[, "ratio"]} dicts, or None
        self.causal_mask_type = kwargs.get("causal_mask_type", None)

    def build_mix_causal_mask(self, attn_mask, rand=None, ratio=None):
        """Per-sample, randomly mix block-causal masks into the padding mask.

        A sample whose random draw falls in [prob_min, prob_max) gets the
        corresponding block-causal mask (ANDed with its padding mask);
        otherwise the original mask is kept.
        """
        b, _, _, t = attn_mask.shape
        if rand is None:
            rand = torch.rand((b, 1, 1, 1), device=attn_mask.device, dtype=torch.float32)
        mixed_mask = attn_mask.clone()
        for item in self.causal_mask_type:
            prob_min, prob_max = item["prob_min"], item["prob_max"]
            _ratio = 1
            if "ratio" in item:
                _ratio = item["ratio"]
            # an explicit ratio argument overrides the per-item setting
            if ratio is not None:
                _ratio = ratio
            block_size = item["block_size"] * _ratio
            if block_size <= 0:
                # non-positive block size means "no causality": keep the padding mask
                causal_mask = attn_mask
            else:
                causal_mask = causal_block_mask(
                    t, block_size, attn_mask.device, torch.float32
                ).unsqueeze(0).unsqueeze(1)  # 1,1,T,T
            flag = (prob_min <= rand) & (rand < prob_max)
            mixed_mask = mixed_mask * (~flag) + (causal_mask * attn_mask) * flag

        return mixed_mask

    def forward(
        self,
        x: float["b n d"],  # nosied input audio
        cond: float["b n d"],  # masked cond audio
        mu: int["b nt d"],  # mu
        spks: float["b 1 d"],  # spk xvec
        time: float["b"] | float[""],  # time step
        return_hidden: bool = False,
        mask: bool["b 1 n"] | None = None,
        mask_rand: float["b 1 1"] = None,  # for mask flag type
        **kwargs,
    ):
        # NOTE(review): despite the Optional annotations, mask (and mask_rand
        # when causal_mask_type is set) must be provided — confirm callers.
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            time = time.repeat(batch)

        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        x = self.input_embed(x, cond, mu, spks.squeeze(1))

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

        if self.long_skip_connection is not None:
            residual = x

        mask = mask.unsqueeze(1)  # B,1,1,T
        if self.causal_mask_type is not None:
            mask = self.build_mix_causal_mask(mask, rand=mask_rand.unsqueeze(-1))

        for block in self.transformer_blocks:
            # mask-out padded values for amp training
            x = x * mask[:, 0, -1, :].unsqueeze(-1)
            x = block(x, t, mask=mask.bool(), rope=rope)

        if self.long_skip_connection is not None:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        if return_hidden:
            # hidden states are not tracked in this implementation
            return output, None

        return output
funcineforge/models/modules/dit_flow_matching/dit_modules.py ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from typing import Optional
12
+ import math
13
+
14
+ import torch
15
+ from torch import nn
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+
19
+ from x_transformers.x_transformers import apply_rotary_pos_emb
20
+
21
+
22
+ # raw wav to mel spec
23
class MelSpec(nn.Module):
    """Compute log-mel spectrograms from raw waveforms via torchaudio.

    The output is clamped at 1e-5 before log to avoid -inf on silence.
    """

    def __init__(
        self,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        normalize=False,
        power=1,
        norm=None,
        center=True,
    ):
        super().__init__()
        self.n_mel_channels = n_mel_channels

        self.mel_stft = torchaudio.transforms.MelSpectrogram(
            sample_rate=target_sample_rate,
            n_fft=filter_length,
            win_length=win_length,
            hop_length=hop_length,
            n_mels=n_mel_channels,
            power=power,
            center=center,
            normalized=normalize,
            norm=norm,
        )

        # device probe: lets forward() detect which device the module lives on
        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, inp):
        """inp: (b, nw) or (b, 1, nw) waveform -> (b, n_mels, frames) log-mel."""
        if len(inp.shape) == 3:
            inp = inp.squeeze(1)  # 'b 1 nw -> b nw'

        assert len(inp.shape) == 2

        # move the whole module to the input's device if needed
        if self.dummy.device != inp.device:
            self.to(inp.device)

        mel = self.mel_stft(inp)
        mel = mel.clamp(min=1e-5).log()
        return mel
65
+
66
+
67
+ # sinusoidal position embedding
68
+
69
+
70
class SinusPositionEmbedding(nn.Module):
    """Classic sinusoidal embedding: maps a 1-D tensor of positions/timesteps
    of shape (b,) to (b, dim), first half sines, second half cosines."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        half = self.dim // 2
        # geometric frequency ladder, identical to the transformer recipe
        step = math.log(10000) / (half - 1)
        freqs = torch.exp(-step * torch.arange(half, device=x.device).float())
        args = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((args.sin(), args.cos()), dim=-1)
83
+
84
+
85
+ # convolutional position embedding
86
+
87
+
88
class ConvPositionEmbedding(nn.Module):
    """Two grouped Conv1d + Mish layers producing a position-aware feature map.

    Padded positions (mask=False) are zeroed both before and after the convs.
    """

    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0
        same_pad = kernel_size // 2
        stack = []
        for _ in range(2):
            stack.append(nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=same_pad))
            stack.append(nn.Mish())
        self.conv1d = nn.Sequential(*stack)

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):  # noqa: F722
        if mask is not None:
            mask = mask[..., None]
            x = x.masked_fill(~mask, 0.0)

        # convs operate channel-first
        out = self.conv1d(x.permute(0, 2, 1)).permute(0, 2, 1)

        if mask is not None:
            out = out.masked_fill(~mask, 0.0)

        return out
112
+
113
+
114
class CausalConvPositionEmbedding(nn.Module):
    """Causal variant of ConvPositionEmbedding: each conv is left-padded by
    kernel_size - 1 so position t never sees frames after t."""

    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0
        self.kernel_size = kernel_size
        self.conv1 = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=0),
            nn.Mish(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=0),
            nn.Mish(),
        )

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):  # noqa: F722
        # zero out padded positions before convolving
        if mask is not None:
            mask = mask[..., None]
            x = x.masked_fill(~mask, 0.0)

        x = x.permute(0, 2, 1)
        # left-only padding keeps the convolution causal
        x = F.pad(x, (self.kernel_size - 1, 0, 0, 0))
        x = self.conv1(x)
        x = F.pad(x, (self.kernel_size - 1, 0, 0, 0))
        x = self.conv2(x)
        out = x.permute(0, 2, 1)

        if mask is not None:
            out = out.masked_fill(~mask, 0.0)

        return out
144
+
145
+
146
+ # rotary positional embedding related
147
+
148
+
149
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
    """Precompute the rotary-embedding angle table.

    Returns a (end, 2 * (dim // 2)) tensor whose first half holds cos(angle)
    and second half sin(angle) for each position/frequency pair.
    """
    # NTK-aware base rescaling for longer sequences without fine-tuning
    # (proposed by reddit user bloc97):
    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
    base = theta * theta_rescale_factor ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
    positions = torch.arange(end, device=inv_freq.device)
    angles = torch.outer(positions, inv_freq).float()
    return torch.cat([angles.cos(), angles.sin()], dim=-1)
161
+
162
+
163
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
    """Build integer position indices start[i] + round-down(j * scale),
    clamped to max_pos - 1 to avoid out-of-range lookups.

    start: (b,) long tensor of offsets; returns a (b, length) long tensor.
    """
    scale_t = scale * torch.ones_like(start, dtype=torch.float32)  # in case scale is a scalar
    offsets = torch.arange(length, device=start.device, dtype=torch.float32)
    pos = start.unsqueeze(1) + (offsets.unsqueeze(0) * scale_t.unsqueeze(1)).long()
    # avoid extra long error.
    return pos.clamp(max=max_pos - 1)
173
+
174
+
175
+ # Global Response Normalization layer (Instance Normalization ?)
176
+
177
+
178
class GRN(nn.Module):
    """Global Response Normalization (ConvNeXt-V2).

    gamma and beta start at zero, so the layer is the identity at init.
    """

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        # keep the norm computation in fp32 under mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            g = x.norm(p=2, dim=1, keepdim=True)
            n = g / (g.mean(dim=-1, keepdim=True) + 1e-6)
            return self.gamma * (x * n) + self.beta + x
189
+
190
+
191
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
192
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
193
+
194
+
195
class ConvNeXtV2Block(nn.Module):
    """ConvNeXt-V2 residual block: depthwise conv -> LayerNorm -> pointwise
    MLP with GELU and GRN, plus a residual connection.

    https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
    ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        # 'same' padding for the fixed kernel size of 7
        padding = (dilation * (7 - 1)) // 2
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = x.transpose(1, 2)  # b n d -> b d n
        x = self.dwconv(x)
        x = x.transpose(1, 2)  # b d n -> b n d
        # LayerNorm in fp32 under mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        return residual + x
225
+
226
+
227
+ # AdaLayerNormZero
228
+ # return with modulated x for attn input, and params for later mlp modulation
229
+
230
+
231
class AdaLayerNormZero(nn.Module):
    """adaLN-Zero: project the conditioning embedding to six modulation
    vectors, modulate the normalized input for attention, and return the
    remaining gates/shifts/scales for the later MLP modulation."""

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 6)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb=None):
        params = self.linear(self.silu(emb))
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = params.chunk(6, dim=1)

        # LayerNorm + modulation in fp32 under mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
247
+
248
+
249
+ # AdaLayerNormZero for final layer
250
+ # return only with modulated x for attn input, cuz no more mlp modulation
251
+
252
+
253
class AdaLayerNormZero_Final(nn.Module):
    """Final-layer adaLN: only scale/shift modulation of the normalized input
    (no gates — there is no MLP after the last block)."""

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 2)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb):
        params = self.linear(self.silu(emb))
        scale, shift = params.chunk(2, dim=1)

        # LayerNorm + modulation in fp32 under mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
        return x
269
+
270
+
271
+ # FeedForward
272
+
273
+
274
class FeedForward(nn.Module):
    """Standard transformer MLP: Linear -> GELU -> Dropout -> Linear.

    The nested Sequential layout (project_in as ff[0]) is kept so state_dict
    keys stay compatible with existing checkpoints.
    """

    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
        super().__init__()
        hidden = int(dim * mult)
        out_features = dim if dim_out is None else dim_out

        project_in = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(approximate=approximate))
        self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(hidden, out_features))

    def forward(self, x):
        return self.ff(x)
286
+
287
+
288
+ # Attention with possible joint part
289
+ # modified from diffusers/src/diffusers/models/attention_processor.py
290
+
291
+
292
class Attention(nn.Module):
    """Attention wrapper with optional joint (context) projections.

    The actual computation is delegated to `processor` (AttnProcessor or
    JointAttnProcessor). When context_dim is given, extra k/v (and optionally
    q/out) projections for the context stream are created.

    modified from diffusers/src/diffusers/models/attention_processor.py
    """

    def __init__(
        self,
        processor: JointAttnProcessor | AttnProcessor,
        dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        context_dim: Optional[int] = None,  # if not None -> joint attention
        context_pre_only=None,
    ):
        super().__init__()

        # fixed typo in the original message ("equires" -> "requires")
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("Attention requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.processor = processor

        self.dim = dim
        self.heads = heads
        self.inner_dim = dim_head * heads
        self.dropout = dropout

        self.context_dim = context_dim
        self.context_pre_only = context_pre_only

        # projections for the main (noised-input) stream
        self.to_q = nn.Linear(dim, self.inner_dim)
        self.to_k = nn.Linear(dim, self.inner_dim)
        self.to_v = nn.Linear(dim, self.inner_dim)

        # projections for the context stream (joint attention only)
        if self.context_dim is not None:
            self.to_k_c = nn.Linear(context_dim, self.inner_dim)
            self.to_v_c = nn.Linear(context_dim, self.inner_dim)
            if self.context_pre_only is not None:
                self.to_q_c = nn.Linear(context_dim, self.inner_dim)

        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, dim))
        self.to_out.append(nn.Dropout(dropout))

        if self.context_pre_only is not None and not self.context_pre_only:
            self.to_out_c = nn.Linear(self.inner_dim, dim)

    def forward(
        self,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b n d"] = None,  # context c  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.Tensor:
        # joint path when a context stream is provided, plain path otherwise
        if c is not None:
            return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
        else:
            return self.processor(self, x, mask=mask, rope=rope)
347
+
348
+
349
+ # Attention processor
350
+
351
+
352
class AttnProcessor:
    """Self-attention processor: q/k/v projection, rotary embedding,
    scaled-dot-product attention with an optional padding mask, then the
    output projection. Padded positions are zeroed in the result."""

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding
    ) -> torch.FloatTensor:
        batch_size = x.shape[0]

        # `sample` projections.
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # apply rotary position embedding
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)

            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)

        # attention: reshape to (b, heads, n, head_dim)
        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        if mask is not None:
            attn_mask = mask
            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
        else:
            attn_mask = None

        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        x = x.to(query.dtype)

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)

        # zero out padded query positions in the output
        if mask is not None:
            if mask.dim() == 2:
                mask = mask.unsqueeze(-1)
            else:
                # 4-D mask: take the last row (query coverage) as the padding mask
                mask = mask[:, 0, -1].unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)

        return x
411
+
412
+
413
+ # Joint Attention processor for MM-DiT
414
+ # modified from diffusers/src/diffusers/models/attention_processor.py
415
+
416
+
417
class JointAttnProcessor:
    """Joint attention for MM-DiT: the noised-input stream and the context
    stream are projected separately, concatenated along the sequence axis,
    attended together, then split back.

    modified from diffusers/src/diffusers/models/attention_processor.py
    """

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b nt d"] = None,  # context c, here text  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.FloatTensor:
        residual = x

        batch_size = c.shape[0]

        # `sample` projections.
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # `context` projections.
        c_query = attn.to_q_c(c)
        c_key = attn.to_k_c(c)
        c_value = attn.to_v_c(c)

        # apply rope for context and noised input independently
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
        if c_rope is not None:
            freqs, xpos_scale = c_rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
            c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
            c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)

        # attention over the concatenated [x ; c] sequence
        query = torch.cat([query, c_query], dim=1)
        key = torch.cat([key, c_key], dim=1)
        value = torch.cat([value, c_value], dim=1)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        if mask is not None:
            attn_mask = F.pad(mask, (0, c.shape[1]), value=True)  # no mask for c (text)
            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
        else:
            attn_mask = None

        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        x = x.to(query.dtype)

        # Split the attention outputs.
        x, c = (
            x[:, : residual.shape[1]],
            x[:, residual.shape[1] :],
        )

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)
        if not attn.context_pre_only:
            c = attn.to_out_c(c)

        # zero out padded positions of the x stream only
        if mask is not None:
            mask = mask.unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)
            # c = c.masked_fill(~mask, 0.)  # no mask for c (text)

        return x, c
498
+
499
+
500
+ # DiT Block
501
+
502
+
503
class DiTBlock(nn.Module):
    """Single DiT block: adaLN-Zero-modulated self-attention followed by a
    modulated feed-forward, both gated and residual."""

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
        super().__init__()

        self.attn_norm = AdaLayerNormZero(dim)
        self.attn = Attention(
            processor=AttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
        )

        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, t, mask=None, rope=None):  # x: noised input, t: time embedding
        # pre-norm & modulation for attention input
        norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)

        # attention
        attn_output = self.attn(x=norm, mask=mask, rope=rope)

        # process attention output for input x
        x = x + gate_msa.unsqueeze(1) * attn_output

        # norm + FF in fp32 under mixed precision
        with torch.cuda.amp.autocast(enabled=False):
            ff_norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
            ff_output = self.ff(ff_norm)
            x = x + gate_mlp.unsqueeze(1) * ff_output

        return x
535
+
536
+
537
+ # MMDiT Block https://arxiv.org/abs/2403.03206
538
+
539
+
540
+ class MMDiTBlock(nn.Module):
541
+ r"""
542
+ modified from diffusers/src/diffusers/models/attention.py
543
+
544
+ notes.
545
+ _c: context related. text, cond, etc. (left part in sd3 fig2.b)
546
+ _x: noised input related. (right part)
547
+ context_pre_only: last layer only do prenorm + modulation cuz no more ffn
548
+ """
549
+
550
+ def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_pre_only=False):
551
+ super().__init__()
552
+
553
+ self.context_pre_only = context_pre_only
554
+
555
+ self.attn_norm_c = AdaLayerNormZero_Final(dim) if context_pre_only else AdaLayerNormZero(dim)
556
+ self.attn_norm_x = AdaLayerNormZero(dim)
557
+ self.attn = Attention(
558
+ processor=JointAttnProcessor(),
559
+ dim=dim,
560
+ heads=heads,
561
+ dim_head=dim_head,
562
+ dropout=dropout,
563
+ context_dim=dim,
564
+ context_pre_only=context_pre_only,
565
+ )
566
+
567
+ if not context_pre_only:
568
+ self.ff_norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
569
+ self.ff_c = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
570
+ else:
571
+ self.ff_norm_c = None
572
+ self.ff_c = None
573
+ self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
574
+ self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
575
+
576
+ def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: noised input, c: context, t: time embedding
577
+ # pre-norm & modulation for attention input
578
+ if self.context_pre_only:
579
+ norm_c = self.attn_norm_c(c, t)
580
+ else:
581
+ norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
582
+ norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)
583
+
584
+ # attention
585
+ x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)
586
+
587
+ # process attention output for context c
588
+ if self.context_pre_only:
589
+ c = None
590
+ else: # if not last layer
591
+ c = c + c_gate_msa.unsqueeze(1) * c_attn_output
592
+
593
+ with torch.cuda.amp.autocast(enabled=False):
594
+ norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
595
+ c_ff_output = self.ff_c(norm_c)
596
+ c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
597
+
598
+ # process attention output for input x
599
+ x = x + x_gate_msa.unsqueeze(1) * x_attn_output
600
+
601
+ with torch.cuda.amp.autocast(enabled=False):
602
+ norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
603
+ x_ff_output = self.ff_x(norm_x)
604
+ x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
605
+
606
+ return c, x
607
+
608
+
609
+ # time step conditioning embedding
610
+
611
+
612
class TimestepEmbedding(nn.Module):
    """Map a (b,) timestep tensor to a (b, dim) conditioning vector via a
    sinusoidal embedding followed by a small MLP."""

    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
        self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, timestep: float["b"]):  # noqa: F821
        time_hidden = self.time_embed(timestep)
        # match the MLP's dtype (sinusoidal table is computed in fp32)
        time_hidden = time_hidden.to(timestep.dtype)
        time = self.time_mlp(time_hidden)  # b d
        return time
funcineforge/models/modules/hifigan/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
def get_padding(kernel_size, dilation=1):
    """Return the "same" padding for a stride-1 dilated 1-D convolution."""
    return dilation * (kernel_size - 1) // 2
4
+
5
+
6
def init_weights(m, mean=0.0, std=0.01):
    """Initialise convolutional modules' weights from N(mean, std).

    Non-convolutional modules (class name without "Conv") are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
10
+
11
+
12
+ from funcineforge.models.modules.hifigan.generator import HifiGenerator, NsfHifiGenerator, HiFTGenerator
13
+ from funcineforge.models.modules.hifigan.discriminator import MultipleDiscriminator
14
+ from funcineforge.models.modules.hifigan.nsf_utils import ConvRNNF0Predictor
funcineforge/models/modules/hifigan/activations.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
2
+ # LICENSE is in incl_licenses directory.
3
+
4
+ import torch
5
+ from torch import nn, sin, pow
6
+ from torch.nn import Parameter
7
+
8
+
9
class Snake(nn.Module):
    """Sine-based periodic activation: ``snake(x) = x + sin^2(alpha * x) / alpha``.

    Introduced by Ziyin, Hartwig & Ueda (https://arxiv.org/abs/2006.08195).

    Shape:
        - Input:  (B, C, T)
        - Output: (B, C, T), unchanged

    Example:
        >>> act = Snake(256)
        >>> out = act(torch.randn(4, 256, 100))
    """

    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        """``in_features`` is the channel count C; ``alpha`` sets the frequency.

        With ``alpha_logscale`` the per-channel parameter is stored in log
        space and initialised to zeros (i.e. an effective alpha of 1);
        otherwise it is stored linearly and initialised to ``alpha``.
        Higher alpha means higher frequency.
        """
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale

        # one trainable alpha per channel
        initial = torch.zeros(in_features) if alpha_logscale else alpha * torch.ones(in_features)
        self.alpha = Parameter(initial)
        self.alpha.requires_grad = alpha_trainable

        # keeps the 1/alpha term finite when alpha approaches zero
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """Apply the activation elementwise to ``x`` of shape (B, C, T)."""
        a = self.alpha[None, :, None]  # broadcast per-channel alpha to (1, C, 1)
        if self.alpha_logscale:
            a = torch.exp(a)
        return x + (1.0 / (a + self.no_div_by_zero)) * pow(sin(x * a), 2)
60
+
61
+
62
class SnakeBeta(nn.Module):
    """Snake variant with a separate magnitude: ``x + sin^2(alpha * x) / beta``.

    ``alpha`` controls the frequency and ``beta`` the magnitude of the
    periodic component. Modified from Ziyin, Hartwig & Ueda
    (https://arxiv.org/abs/2006.08195).

    Shape:
        - Input:  (B, C, T)
        - Output: (B, C, T), unchanged

    Example:
        >>> act = SnakeBeta(256)
        >>> out = act(torch.randn(4, 256, 100))
    """

    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        """``in_features`` is the channel count C.

        With ``alpha_logscale`` both parameters are stored in log space and
        initialised to zeros (effective value 1); otherwise they are stored
        linearly. NOTE: ``beta`` is initialised from the ``alpha`` argument as
        well — this mirrors the reference implementation.
        """
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale

        # one trainable (alpha, beta) pair per channel
        if alpha_logscale:
            self.alpha = Parameter(torch.zeros(in_features))
            self.beta = Parameter(torch.zeros(in_features))
        else:
            self.alpha = Parameter(alpha * torch.ones(in_features))
            self.beta = Parameter(alpha * torch.ones(in_features))
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

        # keeps the 1/beta term finite when beta approaches zero
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """Apply the activation elementwise to ``x`` of shape (B, C, T)."""
        a = self.alpha[None, :, None]  # (1, C, 1) broadcast over (B, C, T)
        b = self.beta[None, :, None]
        if self.alpha_logscale:
            a = torch.exp(a)
            b = torch.exp(b)
        return x + (1.0 / (b + self.no_div_by_zero)) * pow(sin(x * a), 2)
funcineforge/models/modules/hifigan/discriminator.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """hifigan based dicriminator implementation.
2
+
3
+ This code is modified from https://github.com/jik876/hifi-gan and https://github.com/kan-bayashi/ParallelWaveGAN.
4
+
5
+ """
6
+
7
+ import typing as tp
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.nn as nn
12
+ from torch.nn import Conv2d, AvgPool1d, Conv1d
13
+ from torch.nn.utils import weight_norm, spectral_norm
14
+
15
+ from funcineforge.models.modules.hifigan import get_padding
16
+
17
+
18
class DiscriminatorP(torch.nn.Module):
    """HiFi-GAN period discriminator.

    Folds a mono waveform into a 2-D (time // period, period) view and runs a
    stack of strided (k, 1) convolutions over the folded time axis.
    """

    def __init__(self, period, kernel_size=5, stride=3,
                 use_spectral_norm=False, lrelu_slope=0.1):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.lrelu_slope = lrelu_slope

        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        # channel progression of the strided stack; the padding intentionally
        # uses get_padding(5, 1) regardless of kernel_size (as in upstream
        # hifi-gan)
        channels = [1, 32, 128, 512, 1024]
        self.convs = nn.ModuleList([
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1),
                          padding=(get_padding(5, 1), 0)))
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ])
        self.convs.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))))
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """x: (B, 1, T) waveform -> (flattened score, per-layer feature maps)."""
        fmap = []

        # reflect-pad T up to a multiple of the period, then fold 1d -> 2d
        batch, ch, t = x.shape
        if t % self.period != 0:
            pad_amount = self.period - (t % self.period)
            x = F.pad(x, (0, pad_amount), "reflect")
            t = t + pad_amount
        x = x.view(batch, ch, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.lrelu_slope)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
71
+
72
+
73
class MultiPeriodDiscriminator(torch.nn.Module):
    """Ensemble of DiscriminatorP heads, one per period."""

    def __init__(self,
                 in_channels: int = 1,
                 periods: tp.List[int] = [2, 3, 5, 7, 11]):
        super(MultiPeriodDiscriminator, self).__init__()
        # in_channels is accepted for interface uniformity with the other
        # discriminator ensembles; DiscriminatorP operates on mono input.
        self.discriminators = nn.ModuleList(DiscriminatorP(period) for period in periods)

    def forward(self, x: torch.Tensor, return_intermediates: bool = True):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List: one entry per period — a (score, feature_maps) tuple when
            ``return_intermediates``, otherwise the score only.
        """
        if return_intermediates:
            return [disc(x) for disc in self.discriminators]
        return [disc(x)[0] for disc in self.discriminators]
102
+
103
+
104
class DiscriminatorS(torch.nn.Module):
    """HiFi-GAN scale discriminator: grouped, strided 1-D convolution stack."""

    def __init__(self, use_spectral_norm=False, lrelu_slope=0.1):
        super(DiscriminatorS, self).__init__()
        self.lrelu_slope = lrelu_slope
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        # (in_ch, out_ch, kernel, stride, groups, padding) per layer
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p))
            for c_in, c_out, k, s, g, p in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """x: (B, 1, T) waveform -> (flattened score, per-layer feature maps)."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.lrelu_slope)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
131
+
132
+
133
class MultiScaleDiscriminator(torch.nn.Module):
    """Ensemble of DiscriminatorS heads at progressively downsampled scales.

    Fix: honour ``nb_scales`` — the parameter was previously accepted but
    ignored (exactly 3 scales were always built). The default ``nb_scales=3``
    reproduces the original behaviour: the first (raw-scale) discriminator
    uses spectral norm, the rest weight norm, and each scale after the first
    sees the input downsampled by an additional stride-2 average pooling.
    """

    def __init__(self, in_channels: int = 1, nb_scales: int = 3):
        super(MultiScaleDiscriminator, self).__init__()
        # in_channels kept for interface uniformity; DiscriminatorS is mono.
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=(i == 0)) for i in range(nb_scales)
        ])
        self.meanpools = nn.ModuleList([
            AvgPool1d(4, 2, padding=2) for _ in range(nb_scales - 1)
        ])

    def forward(self, x: torch.Tensor, return_intermediates: bool = True):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List: one entry per scale — a (score, feature_maps) tuple when
            ``return_intermediates``, otherwise the score only.
        """
        outs = []
        for i, f in enumerate(self.discriminators):
            if i != 0:
                # halve the sampling rate for each additional scale
                x = self.meanpools[i - 1](x)
            outs.append(f(x) if return_intermediates else f(x)[0])
        return outs
165
+
166
+
167
class DiscriminatorR(nn.Module):
    """Resolution discriminator operating on a linear STFT magnitude."""

    def __init__(
        self,
        stft_params: tp.List[int],
        lrelu_slope: float = 0.1,
        use_spectral_norm: bool = False,
    ):
        super().__init__()

        self.stft_params = stft_params  # [n_fft, hop_length, win_length]
        self.lrelu_slope = lrelu_slope
        norm_f = weight_norm if not use_spectral_norm else spectral_norm

        # (in_ch, out_ch, kernel, stride, padding) per layer; stride (1, 2)
        # progressively compresses the time axis of the spectrogram
        layer_specs = [
            (1, 32, (3, 9), (1, 1), (1, 4)),
            (32, 32, (3, 9), (1, 2), (1, 4)),
            (32, 32, (3, 9), (1, 2), (1, 4)),
            (32, 32, (3, 9), (1, 2), (1, 4)),
            (32, 32, (3, 3), (1, 1), (1, 1)),
        ]
        self.convs = nn.ModuleList([
            norm_f(nn.Conv2d(c_in, c_out, k, stride=s, padding=p))
            for c_in, c_out, k, s, p in layer_specs
        ])
        self.conv_post = norm_f(nn.Conv2d(32, 1, (3, 3), padding=(1, 1)))

    def spectrogram(self, x):
        """(B, 1, T) waveform -> (B, F, frames) STFT magnitude."""
        n_fft, hop_length, win_length = self.stft_params
        half_pad = int((n_fft - hop_length) / 2)
        x = F.pad(x, (half_pad, half_pad), mode='reflect').squeeze(1)
        spec = torch.stft(x, n_fft, hop_length=hop_length, win_length=win_length,
                          center=False, pad_mode='reflect', normalized=False,
                          onesided=True, return_complex=True)
        # magnitude = L2 norm over the (real, imag) pair -> [B, F, frames]
        return torch.norm(torch.view_as_real(spec), p=2, dim=-1)

    def forward(self, x):
        """x: (B, 1, T) waveform -> (flattened score, per-layer feature maps)."""
        fmap = []
        x = self.spectrogram(x).unsqueeze(1)
        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.lrelu_slope)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
215
+
216
+
217
class MultiResolutionDiscriminator(nn.Module):
    """Bank of DiscriminatorR heads, one per STFT resolution."""

    def __init__(
        self,
        in_channels: int,
        fft_sizes: tp.List[int] = [1024, 2048, 512],
        hop_sizes: tp.List[int] = [120, 240, 50],
        win_lengths: tp.List[int] = [600, 1200, 240],
        lrelu_slope: float = 0.1,
    ):
        super().__init__()
        # in_channels is accepted for interface uniformity; DiscriminatorR
        # consumes a mono waveform.
        self.discriminators = nn.ModuleList(
            DiscriminatorR([n_fft, hop, win], lrelu_slope)
            for n_fft, hop, win in zip(fft_sizes, hop_sizes, win_lengths)
        )

    def forward(self, x: torch.Tensor, return_intermediates: bool = True):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List: one entry per resolution — a (score, feature_maps) tuple
            when ``return_intermediates``, otherwise the score only.
        """
        if return_intermediates:
            return [disc(x) for disc in self.discriminators]
        return [disc(x)[0] for disc in self.discriminators]
252
+
253
+
254
class MultipleDiscriminator(nn.Module):
    """Container combining several GAN discriminator ensembles.

    Each entry of ``disc_conf_list`` is a dict with a ``name`` key selecting
    the discriminator type ('mpd' / 'msd' / 'mrd'); the remaining keys are
    forwarded to that class's constructor.

    Improvements over the original:
    - ``disc_conf_list=None`` (the declared default) now yields an empty
      ensemble instead of raising ``TypeError`` when iterating ``None``.
    - each config dict is shallow-copied before ``pop("name")``, so the
      caller's configuration is never mutated (the original popped the key
      and re-inserted it afterwards for config dumping).
    """

    def __init__(
        self,
        input_size: int = 1,
        disc_conf_list: tp.List[tp.Dict[str, tp.Any]] = None,
    ):
        super().__init__()

        self.support_disc_choices = dict(
            mpd=MultiPeriodDiscriminator,
            msd=MultiScaleDiscriminator,
            mrd=MultiResolutionDiscriminator,
        )

        self.discriminators = nn.ModuleList()
        self.discriminator_type_lst = []
        for conf in (disc_conf_list or []):
            assert "name" in conf, "disc_conf must have `name` attr to specific disc type."
            kwargs = dict(conf)  # copy so the caller's config stays intact
            disc_type = kwargs.pop("name")
            assert disc_type in self.support_disc_choices, \
                "Unsupported discriminator type, only support {}".format(
                    ",".join(self.support_disc_choices.keys())
                )

            disc_class = self.support_disc_choices[disc_type]
            self.discriminators.append(disc_class(in_channels=input_size, **kwargs))
            self.discriminator_type_lst.append(disc_type)

    def get_discriminator_type_lst(self) -> tp.List[str]:
        """Return the type names ('mpd'/'msd'/'mrd') of the wrapped discriminators, in order."""
        return self.discriminator_type_lst

    def forward(self, x, return_intermediates=True):
        """Run every discriminator ensemble on ``x`` and flatten their outputs.

        Each ensemble returns a list of per-head results, which are
        concatenated into a single flat list.
        """
        retval = []
        for disc in self.discriminators:
            out = disc(x, return_intermediates=return_intermediates)
            if isinstance(out, tuple):
                retval.append(out)
            elif isinstance(out, list):
                retval.extend(out)
            else:
                raise TypeError("The return value of discriminator must be tuple or list[tuple]")

        return retval
funcineforge/models/modules/hifigan/generator.py ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """hifigan based generator implementation.
2
+
3
+ This code is modified from https://github.com/jik876/hifi-gan
4
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
5
+ https://github.com/NVIDIA/BigVGAN
6
+
7
+ """
8
+
9
+ import typing as tp
10
+
11
+ import numpy as np
12
+ from scipy.signal import get_window
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+
17
+ from torch.nn import Conv1d, ConvTranspose1d
18
+ from torch.nn.utils import weight_norm
19
+ from torch.nn.utils import remove_weight_norm
20
+
21
+ from funcineforge.models.modules.hifigan import get_padding, init_weights
22
+ from funcineforge.models.modules.hifigan.activations import Snake, SnakeBeta
23
+ from funcineforge.models.modules.hifigan.nsf_utils import SourceModule, SourceModuleHnNSF
24
+
25
+
26
class ResBlock(torch.nn.Module):
    """Residual block module in HiFiGAN/BigVGAN.

    A stack of dilated 1-D convolutions with residual connections. Each
    branch is ``activation -> dilated conv`` and, when
    ``use_additional_convs``, additionally ``activation -> conv`` with
    dilation 1; the branch output is added back onto the input. The channel
    count and time length are preserved.

    Args:
        channels: number of input/output channels.
        kernel_size: kernel size of every convolution.
        dilations: dilation factor per residual branch.
        use_additional_convs: add a second (dilation-1) conv per branch.
        nonlinear_activation: "LeakyReLU", "Snake" or "SnakeBeta".
        nonlinear_activation_params: kwargs for the chosen activation.
    """
    def __init__(
        self,
        channels: int = 512,
        kernel_size: int = 3,
        dilations: tp.List[int] = [1, 3, 5],
        use_additional_convs: bool = True,
        nonlinear_activation: str = "LeakyReLU",
        nonlinear_activation_params: tp.Dict[str, tp.Any] = {"negative_slope": 0.1},
    ):
        super(ResBlock, self).__init__()
        self.use_additional_convs = use_additional_convs

        self.convs1 = nn.ModuleList()
        if use_additional_convs:
            self.convs2 = nn.ModuleList()

        # one dilated conv (convs1) and optionally one plain conv (convs2)
        # per dilation; "same" padding keeps the time length fixed
        for dilation in dilations:
            self.convs1.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation,
                        padding=get_padding(kernel_size, dilation)
                    )
                )
            )

            if use_additional_convs:
                self.convs2.append(
                    weight_norm(
                        Conv1d(
                            channels,
                            channels,
                            kernel_size,
                            1,
                            dilation=1,
                            padding=get_padding(kernel_size, 1)
                        )
                    )
                )

        self.convs1.apply(init_weights)
        if use_additional_convs:
            self.convs2.apply(init_weights)

        # Activations are module instances (not functionals) so that Snake /
        # SnakeBeta per-channel parameters are registered and trained.
        if nonlinear_activation == "LeakyReLU":
            self.activations1 = nn.ModuleList([
                nn.LeakyReLU(nonlinear_activation_params["negative_slope"])
                for _ in range(len(self.convs1))
            ])
            if use_additional_convs:
                self.activations2 = nn.ModuleList([
                    nn.LeakyReLU(nonlinear_activation_params["negative_slope"])
                    for _ in range(len(self.convs2))
                ])

        elif nonlinear_activation == "Snake":
            self.activations1 = nn.ModuleList([
                Snake(channels, alpha_logscale=nonlinear_activation_params.get("alpha_logscale", False))
                for _ in range(len(self.convs1))
            ])
            if use_additional_convs:
                self.activations2 = nn.ModuleList([
                    Snake(channels, alpha_logscale=nonlinear_activation_params.get("alpha_logscale", False))
                    for _ in range(len(self.convs2))
                ])

        elif nonlinear_activation == "SnakeBeta":
            self.activations1 = nn.ModuleList([
                SnakeBeta(channels, alpha_logscale=nonlinear_activation_params.get("alpha_logscale", False))
                for _ in range(len(self.convs1))
            ])
            if use_additional_convs:
                self.activations2 = nn.ModuleList([
                    SnakeBeta(channels, alpha_logscale=nonlinear_activation_params.get("alpha_logscale", False))
                    for _ in range(len(self.convs2))
                ])

        else:
            raise NotImplementedError

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """x: (B, channels, T) -> (B, channels, T), residual branches summed in."""
        for idx in range(len(self.convs1)):
            xt = self.activations1[idx](x)
            xt = self.convs1[idx](xt)
            if self.use_additional_convs:
                xt = self.activations2[idx](xt)
                xt = self.convs2[idx](xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from all convolutions (for inference export)."""
        for idx in range(len(self.convs1)):
            remove_weight_norm(self.convs1[idx])
            if self.use_additional_convs:
                remove_weight_norm(self.convs2[idx])
127
+
128
+
129
class HifiGenerator(nn.Module):
    """HiFi-GAN waveform generator.

    Upsamples an input feature sequence (e.g. a mel spectrogram) to a mono
    waveform via a stack of transposed convolutions; each stage is followed
    by a bank of ResBlocks with different kernel sizes whose outputs are
    averaged (multi-receptive-field fusion).

    Args:
        in_channels: number of input feature channels.
        base_channels: channel width before the first upsampling stage;
            halved after every stage.
        global_channels: size of an optional global conditioning vector
            ``g``; <= 0 disables global conditioning.
        upsample_rates / upsample_kernel_sizes: stride and kernel size of
            each ConvTranspose1d stage (total upsampling factor is the
            product of the rates).
        resblock_kernel_sizes / resblock_dilation_sizes: one ResBlock per
            kernel size at each stage, with the matching dilation list.
        resblock_nonlinear_activation(_params): activation used inside the
            ResBlocks.
        use_additional_convs: pass-through to ResBlock.
        cond_in_each_up_layer: additionally inject ``g`` after every
            upsampling stage (effective only when global_channels > 0).
        lrelu_slope: negative slope of the inter-stage LeakyReLU.
        act_pre_each_up_layer: apply a LeakyReLU before each upsampling conv.
    """
    def __init__(
        self,
        in_channels: int = 80,
        base_channels: int = 512,
        global_channels: int = -1,
        upsample_rates: tp.List[int] = [8, 8, 2, 2],
        upsample_kernel_sizes: tp.List[int] = [16, 16, 4, 4],
        resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
        resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        resblock_nonlinear_activation: str = "LeakyReLU",
        resblock_nonlinear_activation_params: tp.Dict[str, tp.Any] = {"negative_slope": 0.1},
        use_additional_convs: bool = True,
        cond_in_each_up_layer: bool = False,
        lrelu_slope: float = 0.1,
        act_pre_each_up_layer: bool = True
    ):
        super(HifiGenerator, self).__init__()

        self.out_channels = 1
        self.global_channels = global_channels
        self.use_additional_convs = use_additional_convs
        # per-stage conditioning only makes sense when a global vector exists
        self.cond_in_each_up_layer = cond_in_each_up_layer if global_channels > 0 else False
        self.lrelu_slope = lrelu_slope
        self.act_pre_each_up_layer = act_pre_each_up_layer

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        self.conv_pre = weight_norm(
            Conv1d(in_channels, base_channels, 7, 1, padding=3)
        )

        # transposed-conv upsampling stack; channels are halved each stage
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        base_channels // (2**i),
                        base_channels // (2**(i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        # num_kernels ResBlocks per upsampling stage, stored flat; stage i's
        # blocks live at indices [i * num_kernels, (i + 1) * num_kernels)
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2**(i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d, use_additional_convs,
                                               resblock_nonlinear_activation,
                                               resblock_nonlinear_activation_params))

        if self.global_channels > 0:
            self.conv_global_cond = weight_norm(
                Conv1d(global_channels, base_channels, 1)
            )
            self.conv_global_cond.apply(init_weights)

            if self.cond_in_each_up_layer:
                self.conv_conds = nn.ModuleList()
                for i in range(len(self.ups)):
                    self.conv_conds.append(weight_norm(
                        nn.Conv1d(global_channels, base_channels // (2**(i + 1)), 1))
                    )
                self.conv_conds.apply(init_weights)

        # `ch` is the channel count after the last stage (loop variable above)
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def output_size(self):
        """Number of output channels (always 1: mono waveform)."""
        return self.out_channels

    def forward(self, x: torch.Tensor, g: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
        """Generate a waveform.

        Args:
            x: input features (B, in_channels, T).
            g: optional global conditioning (B, global_channels, 1).

        Returns:
            Waveform in [-1, 1] of shape (B, 1, T * prod(upsample_rates)).
        """
        # x in (B, in_channels, T), g in (B, global_channels, 1)
        x = self.conv_pre(x)
        if self.global_channels > 0 and g is not None:
            x = x + self.conv_global_cond(g)

        for i in range(self.num_upsamples):
            if self.act_pre_each_up_layer:
                x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if self.cond_in_each_up_layer and g is not None:
                x = x + self.conv_conds[i](g)

            # multi-receptive-field fusion: average this stage's ResBlocks
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalization from all layers (for inference export)."""
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        if self.global_channels > 0:
            remove_weight_norm(self.conv_global_cond)
        if self.cond_in_each_up_layer:
            for l in self.conv_conds:
                remove_weight_norm(l)
246
+
247
+
248
class NsfHifiGenerator(nn.Module):
    """
    Neural Source Filter + HifiGan

    Like :class:`HifiGenerator`, but an F0-driven harmonic-plus-noise source
    signal is generated at the output sampling rate, downsampled to each
    upsampling stage's resolution, and added into the feature stream there
    (the NSF excitation path).

    Extra args compared to HifiGenerator:
        nb_harmonics: number of harmonic overtones in the source signal.
        sampling_rate: output audio sampling rate.
        nsf_alpha / nsf_sigma / nsf_voiced_threshold: source-module
            amplitude, noise std and voiced/unvoiced F0 threshold.
    """
    def __init__(
        self,
        in_channels: int = 80,
        base_channels: int = 512,
        global_channels: int = -1,
        nb_harmonics: int = 7,
        sampling_rate: int = 22050,
        nsf_alpha: float = 0.1,
        nsf_sigma: float = 0.003,
        nsf_voiced_threshold: float = 10,
        upsample_rates: tp.List[int] = [8, 8, 2, 2],
        upsample_kernel_sizes: tp.List[int] = [16, 16, 4, 4],
        resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
        resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        resblock_nonlinear_activation: str = "LeakyReLU",
        resblock_nonlinear_activation_params: tp.Dict[str, tp.Any] = {"negative_slope": 0.1},
        use_additional_convs: bool = True,
        cond_in_each_up_layer: bool = False,
        lrelu_slope: float = 0.1,
        act_pre_each_up_layer: bool = True
    ):
        super(NsfHifiGenerator, self).__init__()

        self.out_channels = 1
        self.global_channels = global_channels
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.use_additional_convs = use_additional_convs
        # per-stage conditioning only makes sense when a global vector exists
        self.cond_in_each_up_layer = cond_in_each_up_layer if global_channels > 0 else False
        self.lrelu_slope = lrelu_slope
        self.act_pre_each_up_layer = act_pre_each_up_layer

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)

        # generates the excitation at the full output rate
        # (np.cumprod(upsample_rates)[-1] = total upsampling factor)
        self.source_module = SourceModule(nb_harmonics, np.cumprod(upsample_rates)[-1],
                                          sampling_rate, nsf_alpha, nsf_sigma, nsf_voiced_threshold)
        self.conv_pre = weight_norm(
            Conv1d(in_channels, base_channels, 7, 1, padding=3)
        )

        # Up
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        base_channels // (2**i),
                        base_channels // (2**(i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
        # Down: bring the full-rate source signal to each stage's resolution.
        # Stage i needs the product of the *remaining* upsample rates as its
        # downsampling factor, hence the reversed cumulative products.
        self.source_downs = nn.ModuleList()
        downsample_rates = [1] + upsample_rates[::-1][:-1]
        downsample_cum_rates = np.cumprod(downsample_rates)
        for i, u in enumerate(downsample_cum_rates[::-1]):
            if (u == 1):
                self.source_downs.append(
                    weight_norm(Conv1d(1, base_channels // (2 ** (i + 1)), 1, 1))
                )
            else:
                self.source_downs.append(
                    weight_norm(Conv1d(1, base_channels // (2 ** (i + 1)), u*2, u, padding=(u//2)))
                )

        # num_kernels ResBlocks per upsampling stage, stored flat
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2**(i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d, use_additional_convs,
                                               resblock_nonlinear_activation,
                                               resblock_nonlinear_activation_params))

        if self.global_channels > 0:
            self.conv_global_cond = weight_norm(
                Conv1d(global_channels, base_channels, 1)
            )
            self.conv_global_cond.apply(init_weights)

            if self.cond_in_each_up_layer:
                self.conv_conds = nn.ModuleList()
                for i in range(len(self.ups)):
                    self.conv_conds.append(weight_norm(
                        nn.Conv1d(global_channels, base_channels // (2**(i + 1)), 1))
                    )
                self.conv_conds.apply(init_weights)

        # `ch` is the channel count after the last stage (loop variable above)
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def output_size(self):
        """Number of output channels (always 1: mono waveform)."""
        return self.out_channels

    def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
        """Turn an F0 contour (B, T) into the full-rate excitation signal."""
        return self.source_module(f0.unsqueeze(1))

    def forward(self, x: torch.Tensor, f0: torch.Tensor, g: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
        """Generate a waveform from features ``x`` and F0 contour ``f0``.

        Args:
            x: input features (B, in_channels, T).
            f0: frame-level F0 (B, T).
            g: optional global conditioning (B, global_channels, 1).

        Returns:
            Waveform in [-1, 1] of shape (B, 1, T * prod(upsample_rates)).
        """
        # x in (B, in_channels, T), f0 in (B, T), g in (B, global_channels, 1)

        s = self._f02source(f0)

        x = self.conv_pre(x)
        if self.global_channels > 0 and g is not None:
            x = x + self.conv_global_cond(g)

        for i in range(self.num_upsamples):
            if self.act_pre_each_up_layer:
                x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if self.cond_in_each_up_layer and g is not None:
                x = x + self.conv_conds[i](g)

            # fusion: add the (downsampled) NSF source into this stage
            x = x + self.source_downs[i](s)

            # multi-receptive-field fusion: average this stage's ResBlocks
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalization from all layers (for inference export)."""
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        if self.global_channels > 0:
            remove_weight_norm(self.conv_global_cond)
        if self.cond_in_each_up_layer:
            for l in self.conv_conds:
                remove_weight_norm(l)
        self.source_module.remove_weight_norm()
        for l in self.source_downs:
            remove_weight_norm(l)
403
+
404
+
405
+ class HiFTGenerator(nn.Module):
406
+ """
407
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
408
+ https://arxiv.org/abs/2309.09493
409
+ """
410
+ def __init__(
411
+ self,
412
+ in_channels: int = 80,
413
+ base_channels: int = 512,
414
+ global_channels: int = -1,
415
+ nb_harmonics: int = 8,
416
+ sampling_rate: int = 22050,
417
+ nsf_alpha: float = 0.1,
418
+ nsf_sigma: float = 0.003,
419
+ nsf_voiced_threshold: float = 10,
420
+ upsample_rates: tp.List[int] = [8, 8],
421
+ upsample_kernel_sizes: tp.List[int] = [16, 16],
422
+ istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4},
423
+ resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
424
+ resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
425
+ resblock_nonlinear_activation: str = "Snake",
426
+ resblock_nonlinear_activation_params: tp.Dict[str, tp.Any] = {"alpha_logscale": False},
427
+ source_resblock_kernel_sizes: tp.List[int] = [7, 11],
428
+ source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]],
429
+ source_resblock_nonlinear_activation: str = "Snake",
430
+ source_resblock_nonlinear_activation_params: tp.Dict[str, tp.Any] = {"alpha_logscale": False},
431
+ use_additional_convs: bool = True,
432
+ cond_in_each_up_layer: bool = False,
433
+ lrelu_slope: float = 0.1,
434
+ act_pre_each_up_layer: bool = True,
435
+ audio_limit: float = 0.99,
436
+ ):
437
+ super(HiFTGenerator, self).__init__()
438
+
439
+ self.out_channels = 1
440
+ self.global_channels = global_channels
441
+ self.nb_harmonics = nb_harmonics
442
+ self.sampling_rate = sampling_rate
443
+ self.istft_params = istft_params
444
+ self.use_additional_convs = use_additional_convs
445
+ self.cond_in_each_up_layer = cond_in_each_up_layer if global_channels > 0 else False
446
+ self.lrelu_slope = lrelu_slope
447
+ self.act_pre_each_up_layer = act_pre_each_up_layer
448
+ self.audio_limit = audio_limit
449
+
450
+ self.num_kernels = len(resblock_kernel_sizes)
451
+ self.num_upsamples = len(upsample_rates)
452
+ self.m_source = SourceModuleHnNSF(
453
+ sampling_rate=sampling_rate,
454
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
455
+ harmonic_num=nb_harmonics,
456
+ sine_amp=nsf_alpha,
457
+ add_noise_std=nsf_sigma,
458
+ voiced_threshod=nsf_voiced_threshold)
459
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
460
+
461
+ self.conv_pre = weight_norm(
462
+ Conv1d(in_channels, base_channels, 7, 1, padding=3)
463
+ )
464
+
465
+ # Up
466
+ self.ups = nn.ModuleList()
467
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
468
+ self.ups.append(
469
+ weight_norm(
470
+ ConvTranspose1d(
471
+ base_channels // (2**i),
472
+ base_channels // (2**(i + 1)),
473
+ k,
474
+ u,
475
+ padding=(k - u) // 2,
476
+ )
477
+ )
478
+ )
479
+
480
+ # Down
481
+ self.source_downs = nn.ModuleList()
482
+ self.source_resblocks = nn.ModuleList()
483
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
484
+ downsample_cum_rates = np.cumprod(downsample_rates)
485
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes,
486
+ source_resblock_dilation_sizes)):
487
+ if u == 1:
488
+ self.source_downs.append(
489
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
490
+ )
491
+ else:
492
+ self.source_downs.append(
493
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u*2, u, padding=(u//2))
494
+ )
495
+
496
+ self.source_resblocks.append(
497
+ ResBlock(base_channels // (2 ** (i + 1)), k, d,
498
+ use_additional_convs, source_resblock_nonlinear_activation,
499
+ source_resblock_nonlinear_activation_params)
500
+ )
501
+
502
+ self.resblocks = nn.ModuleList()
503
+ for i in range(len(self.ups)):
504
+ ch = base_channels // (2**(i + 1))
505
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
506
+ self.resblocks.append(ResBlock(ch, k, d, use_additional_convs,
507
+ resblock_nonlinear_activation,
508
+ resblock_nonlinear_activation_params))
509
+
510
+ if self.global_channels > 0:
511
+ self.conv_global_cond = weight_norm(
512
+ Conv1d(global_channels, base_channels, 1)
513
+ )
514
+ self.conv_global_cond.apply(init_weights)
515
+
516
+ if self.cond_in_each_up_layer:
517
+ self.conv_conds = nn.ModuleList()
518
+ for i in range(len(self.ups)):
519
+ self.conv_conds.append(weight_norm(
520
+ nn.Conv1d(global_channels, base_channels // (2**(i + 1)), 1))
521
+ )
522
+ self.conv_conds.apply(init_weights)
523
+
524
+ self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
525
+ self.ups.apply(init_weights)
526
+ self.conv_post.apply(init_weights)
527
+
528
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
529
+ window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
530
+ self.register_buffer("stft_window", window)
531
+
532
+ def output_size(self):
533
+ return self.out_channels
534
+
535
+ def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
536
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
537
+
538
+ har_source, _, _ = self.m_source(f0)
539
+ return har_source.transpose(1, 2)
540
+
541
    def forward(self, x: torch.Tensor, f0: torch.Tensor, g: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
        """Synthesize a waveform from acoustic features and F0 (NSF-style HiFi-GAN).

        Args:
            x: acoustic features, (B, in_channels, T).
            f0: frame-level fundamental frequency in Hz, (B, T).
            g: optional global conditioning embedding, (B, global_channels, 1).

        Returns:
            Waveform from the inverse STFT, clamped to [-audio_limit, audio_limit].
        """
        # x in (B, in_channels, T), f0 in (B, T), g in (B, global_channels, 1)

        # Sample-rate harmonic source derived from F0 (neural source-filter scheme).
        s = self._f02source(f0)

        # STFT of the source; real and imaginary parts stacked along channels so
        # they can be downsampled and fused into each upsampling stage below.
        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)

        x = self.conv_pre(x)
        if self.global_channels > 0 and g is not None:
            # Global conditioning injected once at the input resolution.
            x = x + self.conv_global_cond(g)

        for i in range(self.num_upsamples):
            if self.act_pre_each_up_layer:
                x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if self.cond_in_each_up_layer and g is not None:
                # Optionally re-inject the global embedding at every scale.
                x = x + self.conv_conds[i](g)

            if i == self.num_upsamples - 1:
                # Left-pad one frame (ReflectionPad1d((1, 0)) from __init__)
                # before the final fusion/ResBlock stage.
                x = self.reflection_pad(x)

            # fusion: downsample the source STFT to this scale and add it in.
            si = self.source_downs[i](s_stft)
            si = self.source_resblocks[i](si)
            x = x + si

            # Average the outputs of the parallel multi-kernel ResBlocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        # First n_fft//2 + 1 channels parameterize the log-magnitude, the
        # remaining channels the phase for the inverse STFT.
        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # sin keeps phase bounded; strictly redundant

        x = self._istft(magnitude, phase)
        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
        return x
585
+
586
+ def remove_weight_norm(self):
587
+ print('Removing weight norm...')
588
+ for l in self.ups:
589
+ remove_weight_norm(l)
590
+ for l in self.resblocks:
591
+ l.remove_weight_norm()
592
+ remove_weight_norm(self.conv_pre)
593
+ remove_weight_norm(self.conv_post)
594
+ if self.global_channels > 0:
595
+ remove_weight_norm(self.conv_global_cond)
596
+ if self.cond_in_each_up_layer:
597
+ for l in self.conv_conds:
598
+ remove_weight_norm(l)
599
+ self.source_module.remove_weight_norm()
600
+ for l in self.source_downs:
601
+ remove_weight_norm(l)
602
+ for l in self.source_resblocks:
603
+ l.remove_weight_norm()
604
+
605
+ def _stft(self, x):
606
+ spec = torch.stft(
607
+ x,
608
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window,
609
+ return_complex=True)
610
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
611
+ return spec[...,0], spec[...,1]
612
+
613
+ def _istft(self, magnitude, phase):
614
+ magnitude = torch.clip(magnitude, max=1e2)
615
+ real = magnitude * torch.cos(phase)
616
+ img = magnitude * torch.sin(phase)
617
+ inverse_transform = torch.istft(
618
+ # torch.cat([real.unsqueeze(-1), img.unsqueeze(-1)], dim=-1),
619
+ torch.complex(real, img),
620
+ self.istft_params["n_fft"], self.istft_params["hop_len"],
621
+ self.istft_params["n_fft"], window=self.stft_window,
622
+ return_complex=False
623
+ )
624
+
625
+ return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
funcineforge/models/modules/hifigan/mel_spectrum.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils.data
3
+ import numpy as np
4
+ from librosa.filters import mel as librosa_mel_fn
5
+
6
+
7
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress a numpy magnitude spectrum: log(clip(x, clip_val) * C)."""
    clipped = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(clipped * C)
9
+
10
+
11
def dynamic_range_decompression(x, C=1):
    """Invert dynamic_range_compression (numpy): exp(x) / C."""
    expanded = np.exp(x)
    return expanded / C
13
+
14
+
15
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress a torch magnitude spectrum: log(clamp(x, clip_val) * C)."""
    clamped = torch.clamp(x, min=clip_val)
    return torch.log(clamped * C)
17
+
18
+
19
def dynamic_range_decompression_torch(x, C=1):
    """Invert dynamic_range_compression_torch: exp(x) / C."""
    expanded = torch.exp(x)
    return expanded / C
21
+
22
+
23
def spectral_normalize_torch(magnitudes):
    """Apply log dynamic-range compression to spectrogram magnitudes."""
    return dynamic_range_compression_torch(magnitudes)
26
+
27
+
28
def spectral_de_normalize_torch(magnitudes):
    """Undo the log dynamic-range compression applied by spectral_normalize_torch."""
    return dynamic_range_decompression_torch(magnitudes)
31
+
32
+
33
# Module-level caches shared by the spectrogram helpers below:
# mel_basis is keyed by "<fmax>_<device>" strings, hann_window by str(device).
mel_basis = {}
hann_window = {}
35
+
36
+
37
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of waveform `y` (B, samples).

    Returns (B, num_mels, frames). The mel filterbank and Hann window are cached
    per (fmax, device) / device in the module-level dicts above.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    # Bug fix: the cache test used the raw `fmax` while entries are stored under
    # "<fmax>_<device>" keys, so the filterbank and window were rebuilt on every call.
    key = str(fmax) + '_' + str(y.device)
    if key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
    if str(y.device) not in hann_window:
        # NOTE(review): window cached per device only; a different win_size on
        # the same device would reuse the old window (pre-existing behavior).
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # Bug fix: PyTorch >= 2.0 requires return_complex=True for torch.stft;
    # view_as_real restores the trailing (real, imag) axis the math below expects.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                      window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
61
+
62
+
63
def power_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Log-compressed linear magnitude spectrogram of `y` (B, samples).

    Unlike mel_spectrogram the mel projection is NOT applied here; the mel
    basis is only cached so mel_from_power_spectrogram can use it later.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    # Bug fix: cache membership used the raw `fmax` while entries are stored
    # under "<fmax>_<device>" keys, so the cache never hit and everything was
    # rebuilt on each call.
    key = str(fmax) + '_' + str(y.device)
    if key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[key] = torch.from_numpy(mel).float().to(y.device)
    if str(y.device) not in hann_window:
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # Bug fix: PyTorch >= 2.0 requires return_complex=True for torch.stft;
    # view_as_real restores the trailing (real, imag) axis used below.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                      window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
    spec = spectral_normalize_torch(spec)

    return spec
85
+
86
+
87
def mel_from_power_spectrogram(spec, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Project a log-compressed linear spectrogram onto the cached mel basis.

    NOTE(review): assumes mel_basis was already populated for this
    (fmax, device) pair by a prior mel_spectrogram/power_spectrogram call;
    raises KeyError otherwise.
    """
    global mel_basis, hann_window
    linear = spectral_de_normalize_torch(spec)
    mel = torch.matmul(mel_basis[str(fmax) + '_' + str(spec.device)], linear)
    return spectral_normalize_torch(mel)
funcineforge/models/modules/hifigan/nsf_utils.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Neural Source Filter based modules implementation.
3
+
4
+ Neural source-filter waveform models for statistical parametric speech synthesis
5
+
6
+ """
7
+
8
+ import numpy as np
9
+ import typing as tp
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from torch.nn.utils import weight_norm, remove_weight_norm
15
+ from torch.distributions.uniform import Uniform
16
+ from torch.distributions.normal import Normal
17
+
18
class SineGen(torch.nn.Module):
    """Harmonic sine-wave source generator for NSF vocoders.

    Generates ``harmonic_num + 1`` sinusoids (fundamental plus overtones) with
    random initial phases (the fundamental's phase is pinned to zero), gates
    them by a voiced/unvoiced decision derived from F0, and adds Gaussian
    noise (std ``noise_std`` in voiced regions, ``sine_amp / 3`` in unvoiced
    regions).

    Args:
        samp_rate: sampling rate in Hz.
        harmonic_num: number of harmonic overtones above the fundamental.
        sine_amp: amplitude of the sine waveform.
        noise_std: std of the additive Gaussian noise in voiced regions.
        voiced_threshold: F0 threshold for the voiced/unvoiced decision.
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sampling_rate = samp_rate
        self.harmonic_num = harmonic_num
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # Voiced wherever F0 exceeds the threshold; float mask in {0.0, 1.0}.
        return (f0 > self.voiced_threshold).type(torch.float32)

    @torch.no_grad()
    def forward(self, f0):
        """
        :param f0: [B, 1, sample_len], Hz
        :return: (sine_waves, uv, noise)
        """
        batch, length = f0.size(0), f0.size(-1)

        # Per-harmonic normalized frequency: f0 * (k + 1) / fs.
        F_mat = torch.zeros((batch, self.harmonic_num + 1, length)).to(f0.device)
        for k in range(self.harmonic_num + 1):
            F_mat[:, k:k + 1, :] = f0 * (k + 1) / self.sampling_rate

        # Instantaneous phase plus one random initial phase per harmonic
        # (fundamental fixed at zero phase).
        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
        u_dist = Uniform(low=-np.pi, high=np.pi)
        phase_vec = u_dist.sample(sample_shape=(batch, self.harmonic_num + 1, 1)).to(F_mat.device)
        phase_vec[:, 0, :] = 0

        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)

        uv = self._f02uv(f0)

        # Voiced noise std = noise_std; unvoiced noise scaled so its maximum is
        # comparable to sine_amp (std = sine_amp / 3).
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # Zero the sines in unvoiced regions, then add the noise floor.
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
81
+
82
+
83
class SourceModuleHnNSF(torch.nn.Module):
    """Harmonic-plus-noise source module (hn-NSF).

    Wraps :class:`SineGen` to produce harmonic sinusoids, then merges the
    ``harmonic_num + 1`` harmonic channels into a single excitation via a
    learned linear projection followed by tanh. A separate noise source of the
    same shape as the U/V mask is returned for the noise branch.

    Args:
        sampling_rate: sampling rate in Hz.
        upsample_scale: accepted for interface compatibility; not referenced
            in this implementation.
        harmonic_num: number of harmonics above F0.
        sine_amp: amplitude of the sine source signal.
        add_noise_std: std of the additive Gaussian noise (unvoiced noise
            amplitude is decided by sine_amp).
        voiced_threshod: F0 threshold for the U/V decision.
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Sine-waveform generator for the harmonic branch.
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # Merge all harmonics into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        F0_sampled (batchsize, length, 1) ->
        (sine_merge (batchsize, length, 1), noise, uv)
        """
        # Harmonic branch: sine generation itself carries no gradients.
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
            sine_wavs = sine_wavs.transpose(1, 2)
            uv = uv.transpose(1, 2)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # Noise branch, shaped like the U/V mask.
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
133
+
134
+
135
class SourceModule(torch.nn.Module):
    """Sinusoidal excitation source with a learned 1x1-conv merge.

    Builds harmonics directly from nearest-neighbor-upsampled F0: voiced
    regions receive ``alpha * sin(phase) + N(0, sigma)`` noise, unvoiced
    regions receive scaled noise only, and a weight-normed 1x1 Conv1d + tanh
    merges the ``nb_harmonics + 1`` channels into one.
    """

    def __init__(self,
                 nb_harmonics: int,
                 upsample_ratio: int,
                 sampling_rate: int,
                 alpha: float = 0.1,
                 sigma: float = 0.003,
                 voiced_threshold: float = 10
                 ):
        super(SourceModule, self).__init__()

        self.nb_harmonics = nb_harmonics
        self.upsample_ratio = upsample_ratio
        self.sampling_rate = sampling_rate
        self.alpha = alpha
        self.sigma = sigma
        self.voiced_threshold = voiced_threshold

        # Weight-normed 1x1 conv + tanh merging all harmonic channels.
        self.ffn = nn.Sequential(
            weight_norm(nn.Conv1d(self.nb_harmonics + 1, 1, kernel_size=1, stride=1)),
            nn.Tanh())

    def f02uv(self, f0):
        # Voiced/unvoiced mask: 1.0 where F0 exceeds the threshold.
        return (f0 > self.voiced_threshold).type(torch.float32)

    def forward(self, f0):
        """
        :param f0: [B, 1, frame_len], Hz
        :return: [B, 1, sample_len]
        """
        with torch.no_grad():
            uv = self.f02uv(f0)
            f0_samples = F.interpolate(f0, scale_factor=(self.upsample_ratio), mode='nearest')
            uv_samples = F.interpolate(uv, scale_factor=(self.upsample_ratio), mode='nearest')

        batch = f0_samples.size(0)
        n_channels = self.nb_harmonics + 1

        # Per-harmonic normalized frequency: f0 * (k + 1) / fs.
        freq_mat = torch.zeros((batch, n_channels, f0_samples.size(-1))).to(f0_samples.device)
        for k in range(n_channels):
            freq_mat[:, k:k + 1, :] = f0_samples * (k + 1) / self.sampling_rate

        phase_mat = 2 * np.pi * (torch.cumsum(freq_mat, dim=-1) % 1)
        # Random initial phase per harmonic, fundamental pinned to zero.
        init_phase = Uniform(low=-np.pi, high=np.pi).sample(
            sample_shape=(f0.size(0), n_channels, 1)).to(freq_mat.device)
        init_phase[:, 0, :] = 0

        noise = Normal(loc=0., scale=self.sigma).sample(
            sample_shape=(batch, n_channels, f0_samples.size(-1))).to(freq_mat.device)

        voiced_src = self.alpha * torch.sin(phase_mat + init_phase) + noise
        unvoiced_src = self.alpha / 3 / self.sigma * noise

        excitation = voiced_src * uv_samples + unvoiced_src * (1 - uv_samples)
        return self.ffn(excitation)

    def remove_weight_norm(self):
        # Only the merge conv inside ffn carries weight normalization.
        remove_weight_norm(self.ffn[0])
193
+
194
+
195
class ConvRNNF0Predictor(nn.Module):
    """Frame-level F0 predictor: stacked weight-normed convs, optional GRU, linear head.

    Maps (B, in_channels, T) features to non-negative per-frame predictions
    (B, T) via a final abs() when num_class == 1.
    """

    def __init__(self,
                 num_class: int = 1,
                 in_channels: int = 80,
                 cond_channels: int = 512,
                 use_cond_rnn: bool = True,
                 bidirectional_rnn: bool = False,
                 ):

        super().__init__()

        self.num_class = num_class
        self.use_cond_rnn = use_cond_rnn

        # Five weight-normed Conv1d(k=3) + ELU stages; built in the same order
        # as an explicit Sequential, so parameter initialization is unchanged.
        stages = []
        ch = in_channels
        for _ in range(5):
            stages.append(weight_norm(nn.Conv1d(ch, cond_channels, kernel_size=3, padding=1)))
            stages.append(nn.ELU())
            ch = cond_channels
        self.condnet = nn.Sequential(*stages)

        if self.use_cond_rnn:
            # Hidden size halved when bidirectional so the output stays cond_channels.
            self.rnn = nn.GRU(
                cond_channels,
                cond_channels // 2 if bidirectional_rnn else cond_channels,
                num_layers=1,
                batch_first=True,
                bidirectional=bidirectional_rnn,
            )

        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        feats = self.condnet(x)
        if self.use_cond_rnn:
            feats, _ = self.rnn(feats.transpose(1, 2))
        else:
            feats = feats.transpose(1, 2)

        # abs() keeps the predictions non-negative.
        return torch.abs(self.classifier(feats).squeeze(-1))
251
+
252
+
253
+
funcineforge/models/specaug/__init__.py ADDED
File without changes
funcineforge/models/specaug/mask_along_axis.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from typing import Sequence
4
+ from typing import Union
5
+
6
+
7
def mask_along_axis(
    spec: torch.Tensor,
    spec_lengths: torch.Tensor,
    mask_width_range: Sequence[int] = (0, 30),
    dim: int = 1,
    num_mask: int = 2,
    replace_with_zero: bool = True,
    fill_value: float = 0.0,
):
    """Randomly mask `num_mask` spans along axis `dim` of each batch element.

    Args:
        spec: (Batch, Length, Freq) or (Batch, Channel, Length, Freq)
        spec_lengths: (Length): not used by this implementation
        mask_width_range: span widths are drawn uniformly from this half-open range
    """

    original_shape = spec.size()
    if spec.dim() == 4:
        # (Batch, Channel, Length, Freq) -> (Batch * Channel, Length, Freq)
        spec = spec.view(-1, spec.size(2), spec.size(3))

    batch = spec.shape[0]
    axis_len = spec.shape[dim]

    # widths / starts: (Batch, num_mask, 1), drawn independently per sample.
    widths = torch.randint(
        mask_width_range[0],
        mask_width_range[1],
        (batch, num_mask),
        device=spec.device,
    ).unsqueeze(2)
    starts = torch.randint(
        0, max(1, axis_len - widths.max()), (batch, num_mask), device=spec.device
    ).unsqueeze(2)

    # positions: (1, 1, axis_len); mask: (Batch, num_mask, axis_len).
    positions = torch.arange(axis_len, device=spec.device)[None, None, :]
    span_mask = (starts <= positions) * (positions < (starts + widths))
    # Union of the individual spans: (Batch, axis_len).
    span_mask = span_mask.any(dim=1)

    if dim == 1:
        span_mask = span_mask.unsqueeze(2)  # (Batch, Length, 1)
    elif dim == 2:
        span_mask = span_mask.unsqueeze(1)  # (Batch, 1, Freq)

    # Masked positions get fill_value, or the global mean when requested.
    value = fill_value if replace_with_zero else spec.mean()

    spec = spec.masked_fill(span_mask, value)
    return spec.view(*original_shape), spec_lengths
66
+
67
+
68
class MaskAlongAxis(torch.nn.Module):
    """Randomly mask fixed-width spans along the time or frequency axis."""

    def __init__(
        self,
        mask_width_range: Union[int, Sequence[int]] = (0, 30),
        num_mask: int = 2,
        dim: Union[int, str] = "time",
        replace_with_zero: bool = True,
        fill_value: float = 0.0,
    ):
        # A bare int means "widths from 0 up to that value".
        if isinstance(mask_width_range, int):
            mask_width_range = (0, mask_width_range)
        if len(mask_width_range) != 2:
            raise TypeError(
                f"mask_width_range must be a tuple of int and int values: " f"{mask_width_range}",
            )

        assert mask_width_range[1] > mask_width_range[0]
        if isinstance(dim, str):
            name_to_axis = {"time": 1, "freq": 2}
            if dim not in name_to_axis:
                raise ValueError("dim must be int, 'time' or 'freq'")
            dim = name_to_axis[dim]
        # Human-readable axis label for extra_repr().
        self.mask_axis = {1: "time", 2: "freq"}.get(dim, "unknown")

        super().__init__()
        self.mask_width_range = mask_width_range
        self.num_mask = num_mask
        self.dim = dim
        self.replace_with_zero = replace_with_zero
        self.fill_value = fill_value

    def extra_repr(self):
        return (
            f"mask_width_range={self.mask_width_range}, "
            f"num_mask={self.num_mask}, axis={self.mask_axis}"
        )

    def forward(self, spec: torch.Tensor, spec_lengths: torch.Tensor = None):
        """Mask `spec` of shape (Batch, Length, Freq) along the configured axis."""
        return mask_along_axis(
            spec,
            spec_lengths,
            mask_width_range=self.mask_width_range,
            dim=self.dim,
            num_mask=self.num_mask,
            replace_with_zero=self.replace_with_zero,
            fill_value=self.fill_value,
        )
128
+
129
+
130
class MaskAlongAxisVariableMaxWidth(torch.nn.Module):
    """Mask input spec along a specified axis with a length-dependent max width.

    Formula:
        max_width = max_width_ratio * seq_len
    """

    def __init__(
        self,
        mask_width_ratio_range: Union[float, Sequence[float]] = (0.0, 0.05),
        num_mask: int = 2,
        dim: Union[int, str] = "time",
        replace_with_zero: bool = True,
        fill_value: float = 0.0,
    ):
        # A bare float means "ratios from 0.0 up to that value".
        if isinstance(mask_width_ratio_range, float):
            mask_width_ratio_range = (0.0, mask_width_ratio_range)
        if len(mask_width_ratio_range) != 2:
            raise TypeError(
                f"mask_width_ratio_range must be a tuple of float and float values: "
                f"{mask_width_ratio_range}",
            )

        assert mask_width_ratio_range[1] > mask_width_ratio_range[0]
        if isinstance(dim, str):
            name_to_axis = {"time": 1, "freq": 2}
            if dim not in name_to_axis:
                raise ValueError("dim must be int, 'time' or 'freq'")
            dim = name_to_axis[dim]
        # Human-readable axis label for extra_repr().
        self.mask_axis = {1: "time", 2: "freq"}.get(dim, "unknown")

        super().__init__()
        self.mask_width_ratio_range = mask_width_ratio_range
        self.num_mask = num_mask
        self.dim = dim
        self.replace_with_zero = replace_with_zero
        self.fill_value = fill_value

    def extra_repr(self):
        return (
            f"mask_width_ratio_range={self.mask_width_ratio_range}, "
            f"num_mask={self.num_mask}, axis={self.mask_axis}"
        )

    def forward(self, spec: torch.Tensor, spec_lengths: torch.Tensor = None):
        """Mask `spec` (Batch, Length, Freq); span widths scale with the axis length."""
        seq_len = spec.shape[self.dim]
        lo = max([0, math.floor(seq_len * self.mask_width_ratio_range[0])])
        hi = min([seq_len, math.floor(seq_len * self.mask_width_ratio_range[1])])

        if hi > lo:
            return mask_along_axis(
                spec,
                spec_lengths,
                mask_width_range=(lo, hi),
                dim=self.dim,
                num_mask=self.num_mask,
                replace_with_zero=self.replace_with_zero,
                fill_value=self.fill_value,
            )
        # Sequence too short to derive a positive max width: no-op.
        return spec, spec_lengths
funcineforge/models/specaug/specaug.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SpecAugment module."""
2
+
3
+ from typing import Optional
4
+ from typing import Sequence
5
+ from typing import Union
6
+
7
+ from funcineforge.models.specaug.mask_along_axis import MaskAlongAxis
8
+ from funcineforge.models.specaug.mask_along_axis import MaskAlongAxisVariableMaxWidth
9
+ from funcineforge.models.specaug.time_warp import TimeWarp
10
+
11
+ import torch.nn as nn
12
+
13
+
14
+ class SpecAug(nn.Module):
15
+ """Implementation of SpecAug.
16
+
17
+ Reference:
18
+ Daniel S. Park et al.
19
+ "SpecAugment: A Simple Data
20
+ Augmentation Method for Automatic Speech Recognition"
21
+
22
+ .. warning::
23
+ When using cuda mode, time_warp doesn't have reproducibility
24
+ due to `torch.nn.functional.interpolate`.
25
+
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ apply_time_warp: bool = True,
31
+ time_warp_window: int = 5,
32
+ time_warp_mode: str = "bicubic",
33
+ apply_freq_mask: bool = True,
34
+ freq_mask_width_range: Union[int, Sequence[int]] = (0, 20),
35
+ num_freq_mask: int = 2,
36
+ apply_time_mask: bool = True,
37
+ time_mask_width_range: Optional[Union[int, Sequence[int]]] = None,
38
+ time_mask_width_ratio_range: Optional[Union[float, Sequence[float]]] = None,
39
+ num_time_mask: int = 2,
40
+ fill_value: float = 0.0,
41
+ ):
42
+ if not apply_time_warp and not apply_time_mask and not apply_freq_mask:
43
+ raise ValueError("Either one of time_warp, time_mask, or freq_mask should be applied")
44
+ if (
45
+ apply_time_mask
46
+ and (time_mask_width_range is not None)
47
+ and (time_mask_width_ratio_range is not None)
48
+ ):
49
+ raise ValueError(
50
+ 'Either one of "time_mask_width_range" or '
51
+ '"time_mask_width_ratio_range" can be used'
52
+ )
53
+ super().__init__()
54
+ self.apply_time_warp = apply_time_warp
55
+ self.apply_freq_mask = apply_freq_mask
56
+ self.apply_time_mask = apply_time_mask
57
+
58
+ if apply_time_warp:
59
+ self.time_warp = TimeWarp(window=time_warp_window, mode=time_warp_mode)
60
+ else:
61
+ self.time_warp = None
62
+
63
+ if apply_freq_mask:
64
+ self.freq_mask = MaskAlongAxis(
65
+ dim="freq",
66
+ mask_width_range=freq_mask_width_range,
67
+ num_mask=num_freq_mask,
68
+ fill_value=fill_value,
69
+ )
70
+ else:
71
+ self.freq_mask = None
72
+
73
+ if apply_time_mask:
74
+ if time_mask_width_range is not None:
75
+ self.time_mask = MaskAlongAxis(
76
+ dim="time",
77
+ mask_width_range=time_mask_width_range,
78
+ num_mask=num_time_mask,
79
+ fill_value=fill_value,
80
+ )
81
+ elif time_mask_width_ratio_range is not None:
82
+ self.time_mask = MaskAlongAxisVariableMaxWidth(
83
+ dim="time",
84
+ mask_width_ratio_range=time_mask_width_ratio_range,
85
+ num_mask=num_time_mask,
86
+ fill_value=fill_value,
87
+ )
88
+ else:
89
+ raise ValueError(
90
+ 'Either one of "time_mask_width_range" or '
91
+ '"time_mask_width_ratio_range" should be used.'
92
+ )
93
+ else:
94
+ self.time_mask = None
95
+
96
+ def forward(self, x, x_lengths=None):
97
+ if self.time_warp is not None:
98
+ x, x_lengths = self.time_warp(x, x_lengths)
99
+ if self.freq_mask is not None:
100
+ x, x_lengths = self.freq_mask(x, x_lengths)
101
+ if self.time_mask is not None:
102
+ x, x_lengths = self.time_mask(x, x_lengths)
103
+ return x, x_lengths
funcineforge/models/specaug/time_warp.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Time warp module."""
2
+
3
+ import torch
4
+
5
+ from funcineforge.models.utils.nets_utils import pad_list
6
+
7
DEFAULT_TIME_WARP_MODE = "bicubic"


def time_warp(x: torch.Tensor, window: int = 80, mode: str = DEFAULT_TIME_WARP_MODE):
    """Randomly stretch/compress the time axis around a random center.

    Args:
        x: (Batch, Time, Freq) or (Batch, Channel, Time, Freq)
        window: maximum shift of the warp point
        mode: torch.nn.functional.interpolate mode
    """

    original_shape = x.size()
    if x.dim() == 3:
        # bicubic interpolation needs a 4D tensor: add a channel axis.
        x = x[:, None]

    t = x.shape[2]
    if t - window <= window:
        # Too short to place a warp center; return unchanged.
        return x.view(*original_shape)

    center = torch.randint(window, t - window, (1,))[0]
    warped = torch.randint(center - window, center + window, (1,))[0] + 1

    # left: (Batch, Channel, warped, Freq); right: (Batch, Channel, t - warped, Freq)
    left = torch.nn.functional.interpolate(
        x[:, :, :center], (warped, x.shape[3]), mode=mode, align_corners=False
    )
    right = torch.nn.functional.interpolate(
        x[:, :, center:], (t - warped, x.shape[3]), mode=mode, align_corners=False
    )

    if x.requires_grad:
        # Out-of-place concat keeps the autograd history intact.
        x = torch.cat([left, right], dim=-2)
    else:
        # In-place writes avoid an extra allocation during inference.
        x[:, :, :warped] = left
        x[:, :, warped:] = right

    return x.view(*original_shape)
48
+
49
+
50
class TimeWarp(torch.nn.Module):
    """Module wrapper around :func:`time_warp`.

    Args:
        window: time warp parameter
        mode: Interpolate mode
    """

    def __init__(self, window: int = 80, mode: str = DEFAULT_TIME_WARP_MODE):
        super().__init__()
        self.window = window
        self.mode = mode

    def extra_repr(self):
        return f"window={self.window}, mode={self.mode}"

    def forward(self, x: torch.Tensor, x_lengths: torch.Tensor = None):
        """Warp (Batch, Time, Freq) input; per-sample when lengths differ.

        Args:
            x: (Batch, Time, Freq)
            x_lengths: (Batch,)
        """

        if x_lengths is None or all(le == x_lengths[0] for le in x_lengths):
            # Uniform lengths: the same warp can be applied to the whole batch.
            return time_warp(x, window=self.window, mode=self.mode), x_lengths

        # Lengths differ: warp each sample on its valid prefix, then re-pad.
        # (Batched variable-length warping has no obvious formulation.)
        warped_samples = []
        for idx in range(x.size(0)):
            piece = time_warp(
                x[idx][None, : x_lengths[idx]],
                window=self.window,
                mode=self.mode,
            )[0]
            warped_samples.append(piece)
        return pad_list(warped_samples, 0.0), x_lengths
funcineforge/models/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ import torch
2
+ dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
funcineforge/models/utils/llm_decoding.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import nullcontext
2
+ import torch
3
+ import torch.nn as nn
4
+ from typing import Union
5
+ from funcineforge.utils.hinter import hint_once
6
+ import numpy as np
7
+ dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
8
+
9
+
10
class LLMDecoder(nn.Module):
    """Step-wise autoregressive decoder around a HuggingFace-style causal LM.

    Supports greedy, top-k, nucleus (top-p) and Repetition Aware Sampling
    (VALL-E 2) strategies, with optional KV-cache reuse between steps.
    """

    def __init__(self, **kwargs):
        super(LLMDecoder, self).__init__()
        # Token id(s) that terminate generation; normalized to a list.
        self.eos_token = kwargs["eos"]
        if isinstance(self.eos_token, int):
            self.eos_token = [self.eos_token]
        # Callable mapping a (1, 1) LongTensor of ids to input embeddings.
        self.token_embeder = kwargs["token_embeder"]
        # Optional overrides for ras_sampling hyper-parameters.
        self.ras_conf = kwargs.get("ras_conf", {})
        # Offset added to sampled ids before the embedding lookup.
        self.token_offset = kwargs.get("token_offset", 0)

    def nucleus_sampling(self, weighted_scores, top_p=0.8, top_k=25, beam_size=1):
        """Nucleus (top-p) sampling capped at ``top_k`` candidates.

        Args:
            weighted_scores: 1-D tensor of unnormalized scores/logits.
            top_p: cumulative probability mass to keep.
            top_k: hard cap on the number of candidates.
            beam_size: number of ids to draw (with replacement).

        Returns:
            LongTensor of ``beam_size`` sampled token ids.
        """
        prob, indices = [], []
        cum_prob = 0.0
        sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True)
        for i in range(len(sorted_idx)):
            # Keep candidates while both the top-p mass and the top-k count
            # allow it; the most probable token is always kept.
            if cum_prob < top_p and len(prob) < top_k:
                cum_prob += sorted_value[i]
                prob.append(sorted_value[i])
                indices.append(sorted_idx[i])
            else:
                break
        # torch.stack instead of torch.tensor(list-of-tensors): identical
        # values, but avoids the deprecated/slow element-wise copy path.
        prob = torch.stack(prob).to(weighted_scores)
        indices = torch.stack(indices).to(weighted_scores.device)
        sampling_ids = prob.multinomial(beam_size, replacement=True)
        top_ids = indices[sampling_ids]
        return top_ids

    def random_sampling(self, weighted_scores, beam_size=1):
        """Sample ids from the full softmax distribution."""
        top_ids = weighted_scores.softmax(dim=0).multinomial(beam_size, replacement=True)
        return top_ids

    # Repetition Aware Sampling in VALL-E 2
    def ras_sampling(
        self, weighted_scores, decoded_tokens, *,
        top_p=0.8, top_k=25, win_size=10, tau_r=0.1
    ):
        """Nucleus sampling with a random-sampling fallback when the drawn
        token repeats too often within the last ``win_size`` outputs."""
        if self.ras_conf is not None:
            top_p = self.ras_conf.get("top_p", top_p)
            top_k = self.ras_conf.get("top_k", top_k)
            win_size = self.ras_conf.get("win_size", win_size)
            tau_r = self.ras_conf.get("tau_r", tau_r)

        hint_once(f"using Repetition Aware Sampling: top_p: {top_p}, top_k: {top_k},win_size: {win_size}, tau_r: {tau_r}", "ras_sampling")
        top_ids = self.nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
        rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(top_ids) == top_ids).sum().item()
        if rep_num >= win_size * tau_r:
            top_ids = self.random_sampling(weighted_scores)

        return top_ids

    def sampling_ids(
        self,
        weighted_scores: torch.Tensor,
        sampling: Union[bool, int, float] = True,
        decoded_tokens: list = None,
    ):
        """Dispatch on the ``sampling`` argument:

        * bool  -- True: multinomial over the full softmax; False: argmax.
        * int   -- top-k sampling with k = ``sampling``.
        * float -- nucleus (top-p) sampling with p = ``sampling``.
        * "ras" -- Repetition Aware Sampling (see :meth:`ras_sampling`).
        """
        # NOTE: bool must be tested before int (bool is an int subclass).
        if isinstance(sampling, bool):
            if sampling:
                top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
            else:
                top_ids = weighted_scores.topk(1)[1]
        elif isinstance(sampling, int):
            prob, indices = weighted_scores.softmax(dim=0).topk(sampling)
            sampling_ids = prob.multinomial(1, replacement=True)
            top_ids = indices[sampling_ids]
        elif isinstance(sampling, float):
            # Deduplicated: this branch used to inline a verbatim copy of
            # nucleus_sampling with top_k hard-coded to 25.
            top_ids = self.nucleus_sampling(weighted_scores, top_p=sampling, top_k=25, beam_size=1)
        elif isinstance(sampling, str) and sampling.lower() == "ras":
            top_ids = self.ras_sampling(weighted_scores, decoded_tokens=decoded_tokens)
        else:
            raise NotImplementedError(f"Not implemented for {type(sampling)} sampling")

        return top_ids

    def __call__(self, input_embeddings, llm, states, quantize=False, **kwargs):
        """Generate tokens autoregressively until EOS or ``max_length``.

        Args:
            input_embeddings: (1, T, D) prompt embeddings.
            llm: causal LM exposing an HF-style forward and ``lm_head``.
            states: dict carrying ``llm_cache`` (KV cache) between calls.
            quantize: when True the LM input is cast to bf16 and autocast
                is skipped.

        Returns:
            (out_tokens, hit_eos, states): (1, N) LongTensor of generated ids
            (EOS appended only when ``include_eos``), whether EOS was reached,
            and the updated states dict.
        """
        max_length = kwargs.get("max_length", 60 * 25)
        min_length = kwargs.get("min_length", 2 * 25)
        sampling = kwargs.get("sampling", True)
        device = kwargs.get("device", "cuda")
        llm_dtype = kwargs.get("llm_dtype", "fp32")
        use_llm_cache = kwargs.get("use_llm_cache", True)
        include_eos = kwargs.get("include_eos", False)
        custom_eos_token = kwargs.get("custom_eos_token", self.eos_token)
        avoid_token = kwargs.get("avoid_token", None)

        llm_cache = states.get("llm_cache", None)
        out_tokens, hit_eos = [], False
        for i in range(max_length):
            # NOTE(review): torch.cuda.amp.autocast is deprecated in recent
            # torch in favor of torch.amp.autocast("cuda", ...); kept as-is
            # for compatibility with the torch versions this repo targets.
            with torch.cuda.amp.autocast(
                enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]
            ) if quantize is False else nullcontext():
                # default attention_mask is causal, no longer need manually construct
                # input_masks = torch.ones((1, input_embeddings.shape[1]), device=input_embeddings.device).to(torch.bool)

                if (kwargs.get("use_qlora", False) or kwargs.get("infer_use_lora", False)) and (not kwargs.get("infer_lora_merged", False)):
                    # Un-merged (Q)LoRA: bypass the PEFT wrapper to reach the
                    # base model's forward.
                    outputs = llm.base_model.model(
                        inputs_embeds=input_embeddings.to(torch.bfloat16) if quantize is True else input_embeddings,
                        output_hidden_states=True,
                        return_dict=True,
                        use_cache=use_llm_cache,
                        past_key_values=llm_cache,
                    )
                else:
                    outputs = llm(
                        inputs_embeds=input_embeddings.to(torch.bfloat16) if quantize is True else input_embeddings,
                        output_hidden_states=True,
                        return_dict=True,
                        use_cache=use_llm_cache,
                        past_key_values=llm_cache,
                    )
                lm_hidden_states = outputs.hidden_states[-1]
                h = llm.lm_head(lm_hidden_states[:, -1])
                logp = h.squeeze(0)
                if use_llm_cache:
                    llm_cache = outputs.past_key_values

            pred = torch.log_softmax(logp, dim=-1)

            def _suppress(token_ids):
                # Push the given ids to the dtype's minimum so they cannot be
                # sampled; fp16 min is used for bf16 (representable in both).
                floor = np.finfo(np.float16 if pred.dtype == torch.bfloat16 else np.float32).min
                for tok in token_ids:
                    pred[tok] = float(floor)

            if min_length is not None and i < min_length:
                # Too early to emit EOS.
                _suppress(custom_eos_token)
            if avoid_token is not None and len(avoid_token) > 0:
                _suppress(avoid_token)
            top_id = self.sampling_ids(pred, sampling, out_tokens)[0].item()

            if top_id in custom_eos_token:
                if include_eos:
                    out_tokens.append(top_id)
                hit_eos = True
                break

            out_tokens.append(top_id)
            if use_llm_cache:
                # With a KV cache only the newest token has to be fed back.
                input_embeddings = self.token_embeder(torch.tensor([[top_id]], dtype=torch.int64, device=device) + self.token_offset)
            else:
                input_embeddings = torch.cat([
                    input_embeddings,
                    self.token_embeder(torch.tensor([[top_id]], dtype=torch.int64, device=device) + self.token_offset)
                ], dim=1)

        out_tokens = torch.tensor([out_tokens], dtype=torch.int64, device=device)

        states = {"llm_cache": llm_cache}

        return out_tokens, hit_eos, states
funcineforge/models/utils/mask_along_axis.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Sequence
3
+ from typing import Union
4
+
5
+
6
+ class MaskTailVariableMaxWidth(torch.nn.Module):
7
+ def __init__(
8
+ self,
9
+ mask_width_ratio_range: Union[float, Sequence[float]] = (0.0, 0.05),
10
+ replace_value: float = 0.0,
11
+ ):
12
+ super().__init__()
13
+ self.mask_width_ratio_range = mask_width_ratio_range
14
+ self.replace_value = replace_value
15
+
16
+ def extra_repr(self):
17
+ return (
18
+ f"mask_width_ratio_range={self.mask_width_ratio_range}, "
19
+ )
20
+
21
+ def forward(self, spec: torch.Tensor, spec_lengths: torch.Tensor = None):
22
+ bb, tt, _ = spec.shape
23
+
24
+ mask_width_ratio = torch.rand((bb, 1), device=spec.device)
25
+ ratio_st, ratio_ed = self.mask_width_ratio_range
26
+ mask_width_ratio = mask_width_ratio * (ratio_ed - ratio_st) + ratio_st
27
+ mask_length = (mask_width_ratio * spec_lengths.unsqueeze(1)).to(spec_lengths)
28
+
29
+ # mask_pos: (B, 1)
30
+ mask_start_pos = spec_lengths.unsqueeze(-1) - mask_length
31
+
32
+ aran = torch.arange(tt, device=spec.device)[None, :]
33
+ # mask: (Batch, L)
34
+ mask = aran < mask_start_pos
35
+ # (Batch, L) -> (Batch, L, 1)
36
+ mask = mask.unsqueeze(2)
37
+
38
+ return mask
39
+
40
+ class PrefixMaskVariableMaxWidth(torch.nn.Module):
41
+ def __init__(
42
+ self,
43
+ mask_width_ratio_range: Union[float, Sequence[float]] = (0.0, 0.05),
44
+ replace_value: float = 0.0,
45
+ ):
46
+ super().__init__()
47
+ self.mask_width_ratio_range = mask_width_ratio_range
48
+ self.replace_value = replace_value
49
+
50
+ def extra_repr(self):
51
+ return (
52
+ f"mask_width_ratio_range={self.mask_width_ratio_range}, "
53
+ )
54
+
55
+ def forward(self, spec: torch.Tensor, spec_lengths: torch.Tensor = None, return_mask: bool = False):
56
+ bb, tt, _ = spec.shape
57
+
58
+ mask_width_ratio_range = torch.tensor(self.mask_width_ratio_range, dtype=torch.float32, device=spec.device)
59
+ mask_width_range = (mask_width_ratio_range * tt).long()
60
+ mask_length = torch.randint(
61
+ mask_width_range[0],
62
+ mask_width_range[1],
63
+ (bb, 1),
64
+ device=spec.device,
65
+ ).unsqueeze(2)
66
+
67
+ # mask_pos: (B, num_mask, 1)
68
+ mask_pos = tt - mask_length
69
+
70
+ aran = torch.arange(tt, device=spec.device)[None, None, :]
71
+ # mask: (Batch, num_mask, L)
72
+ mask = (mask_pos <= aran) * (aran < (mask_pos + mask_length))
73
+ # Multiply masks: (Batch, num_mask, L) -> (Batch, L, 1)
74
+ mask = mask.any(dim=1).unsqueeze(2)
75
+
76
+ return mask
funcineforge/models/utils/masks.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def add_optional_chunk_mask(xs: torch.Tensor,
                            masks: torch.Tensor,
                            use_dynamic_chunk: bool,
                            use_dynamic_left_chunk: bool,
                            decoding_chunk_size: int,
                            static_chunk_size: int,
                            num_decoding_left_chunks: int,
                            enable_full_context: bool = True):
    """ Apply optional mask for encoder.

    Args:
        xs (torch.Tensor): padded input, (B, L, D), L for max length
        masks (torch.Tensor): mask for xs, (B, 1, L)
        use_dynamic_chunk (bool): whether to use dynamic chunk or not
        use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
            training.
        decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
            0: default for training, use random dynamic chunk.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
        static_chunk_size (int): chunk size for static chunk training/decoding
            if it's greater than 0, if use_dynamic_chunk is true,
            this parameter will be ignored
        num_decoding_left_chunks: number of left chunks, this is for decoding,
            the chunk size is decoding_chunk_size.
            >=0: use num_decoding_left_chunks
            <0: use all left chunks
        enable_full_context (bool):
            True: chunk size is either [1, 25] or full context(max_len)
            False: chunk size ~ U[1, 25]

    Returns:
        torch.Tensor: chunk mask of the input xs.
    """
    # Whether to use chunk mask or not
    if use_dynamic_chunk:
        max_len = xs.size(1)
        if decoding_chunk_size < 0:
            chunk_size = max_len
            num_left_chunks = -1
        elif decoding_chunk_size > 0:
            chunk_size = decoding_chunk_size
            num_left_chunks = num_decoding_left_chunks
        else:
            # chunk size is either [1, 25] or full context(max_len).
            # Since we use 4 times subsampling and allow up to 1s(100 frames)
            # delay, the maximum frame is 100 / 4 = 25.
            chunk_size = torch.randint(1, max_len, (1, )).item()
            num_left_chunks = -1
            if chunk_size > max_len // 2 and enable_full_context:
                chunk_size = max_len
            else:
                chunk_size = chunk_size % 25 + 1
                if use_dynamic_left_chunk:
                    max_left_chunks = (max_len - 1) // chunk_size
                    num_left_chunks = torch.randint(0, max_left_chunks,
                                                    (1, )).item()
        chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
                                            num_left_chunks,
                                            xs.device)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
        chunk_masks = masks & chunk_masks  # (B, L, L)
    elif static_chunk_size > 0:
        num_left_chunks = num_decoding_left_chunks
        chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
                                            num_left_chunks,
                                            xs.device)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
        chunk_masks = masks & chunk_masks  # (B, L, L)
    else:
        chunk_masks = masks
    assert chunk_masks.dtype == torch.bool
    # NOTE(review): subsequent_chunk_mask in this file ignores
    # num_left_chunks (ONNX-friendly variant) -- confirm left-context
    # limiting is not required by callers.
    if (chunk_masks.sum(dim=-1) == 0).sum().item() != 0:
        # An all-False row would make attention softmax NaN; force it True
        # here and rely on downstream masking to drop these positions.
        print('get chunk_masks all false at some timestep, force set to true, make sure they are masked in future computation!')
        chunk_masks[chunk_masks.sum(dim=-1) == 0] = True
    return chunk_masks
79
+
80
+
81
def subsequent_chunk_mask(
    size: int,
    chunk_size: int,
    num_left_chunks: int = -1,
    device: torch.device = torch.device("cpu"),
) -> torch.Tensor:
    """Chunk-wise causal mask (size, size) for a streaming encoder.

    Position i may attend to every position j inside its own chunk or any
    earlier chunk.

    Args:
        size (int): size of mask
        chunk_size (int): size of chunk
        num_left_chunks (int): number of left chunks (IGNORED -- see note)
        device (torch.device): "cpu" or "cuda" or torch.Tensor.device

    Returns:
        torch.Tensor: boolean mask

    Examples:
        >>> subsequent_chunk_mask(4, 2)
        [[1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 1],
         [1, 1, 1, 1]]
    """
    # NOTE: this vectorized form meets ONNX export requirements, but it does
    # not support num_left_chunks (full left context is always visible).
    positions = torch.arange(size, device=device)
    # Exclusive end of the chunk each position belongs to; torch.div with
    # trunc rounding keeps the graph ONNX-exportable.
    chunk_end = (torch.div(positions, chunk_size, rounding_mode='trunc') + 1) * chunk_size
    return positions.unsqueeze(0) < chunk_end.unsqueeze(1)
113
+
114
def causal_block_mask(size, block_size=1, device="cpu", dtype=torch.bool):
    """Block-causal mask (size, size): each position sees its own block
    and every earlier block.

    :param int size: size of mask
    :param int block_size: block size of mask
    :param str device: "cpu" or "cuda" or torch.Tensor.device
    :param torch.dtype dtype: result dtype
    :rtype: torch.Tensor
    >>> causal_block_mask(4, 2)
    [[1, 1, 0, 0],
     [1, 1, 0, 0],
     [1, 1, 1, 1],
     [1, 1, 1, 1]]
    """
    # `size` need not be a multiple of block_size; the final block is short.
    idx = torch.arange(size, device=device)
    # Exclusive end index of each position's block (trunc div keeps ints).
    block_end = (torch.div(idx, block_size, rounding_mode='trunc') + 1) * block_size
    visible = idx.unsqueeze(0) < block_end.unsqueeze(1)
    return visible.to(dtype)
funcineforge/models/utils/nets_utils.py ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """Network related utility tools."""
4
+
5
+ import logging
6
+ from typing import Dict, List, Tuple
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
def to_device(m, x):
    """Send tensor into the device of the module.

    Args:
        m (torch.nn.Module or torch.Tensor): Object whose device is used.
        x (Tensor): Torch tensor to move.

    Returns:
        Tensor: ``x`` placed on the same device as ``m``.

    Raises:
        TypeError: if ``m`` is neither a Module nor a Tensor.
    """
    if isinstance(m, torch.nn.Module):
        # Modules have no .device attribute; use the first parameter's device.
        device = next(m.parameters()).device
    elif isinstance(m, torch.Tensor):
        device = m.device
    else:
        # Fixed message: was "torch.tensor, bot got".
        raise TypeError("Expected torch.nn.Module or torch.Tensor, " f"but got: {type(m)}")
    return x.to(device)
30
+
31
+
32
def pad_list(xs, pad_value):
    """Pad a list of variable-length tensors into one batched tensor.

    Args:
        xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
        pad_value (float): Value for padding.

    Returns:
        Tensor: Padded tensor (B, Tmax, `*`).

    Examples:
        >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
        >>> pad_list(x, 0)
        tensor([[1., 1., 1., 1.],
                [1., 1., 0., 0.],
                [1., 0., 0., 0.]])
    """
    longest = max(x.size(0) for x in xs)
    # Allocate on the same dtype/device as the inputs, pre-filled with pad.
    padded = xs[0].new(len(xs), longest, *xs[0].size()[1:]).fill_(pad_value)
    for idx, x in enumerate(xs):
        padded[idx, : x.size(0)] = x
    return padded
60
+
61
+
62
def pad_list_all_dim(xs, pad_value):
    """Pad a list of tensors along EVERY dimension to the per-dim maximum.

    Args:
        xs (List): List of Tensors with equal rank but arbitrary sizes.
        pad_value (float): Value for padding.

    Returns:
        Tensor: Padded tensor (B, max_d0, max_d1, ...).

    Examples:
        >>> x = [torch.ones(2, 3), torch.ones(1, 2)]
        >>> pad_list_all_dim(x, 0).shape
        torch.Size([2, 2, 3])
    """
    n_batch = len(xs)
    num_dim = len(xs[0].shape)
    max_len_all_dim = [max(x.size(d) for x in xs) for d in range(num_dim)]
    pad = xs[0].new(n_batch, *max_len_all_dim).fill_(pad_value)

    for i in range(n_batch):
        # Generalized from the original 1/2/3-D special cases: build one
        # slice per dimension so tensors of any rank are supported.
        pad[(i,) + tuple(slice(0, s) for s in xs[i].shape)] = xs[i]

    return pad
102
+
103
+
104
def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
    """Make mask tensor containing indices of padded part (True = padding).

    Args:
        lengths (LongTensor or List): Batch of lengths (B,).
        xs (Tensor, optional): Reference tensor; if set, the mask is
            broadcast/expanded to its shape along ``length_dim``.
        length_dim (int, optional): Dimension of ``xs`` that corresponds to
            time/length.
        maxlen (int, optional): Explicit mask width (only valid without xs).

    Returns:
        Tensor: Boolean mask; True where the position is beyond the length.

    Examples:
        >>> make_pad_mask([5, 3, 2])
        [[0, 0, 0, 0, 0],
         [0, 0, 0, 1, 1],
         [0, 0, 1, 1, 1]]
    """
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))

    if not isinstance(lengths, list):
        lengths = lengths.tolist()
    batch = int(len(lengths))

    # Resolve the mask width: explicit maxlen, the reference tensor's size,
    # or the longest sequence in the batch.
    if maxlen is None:
        maxlen = xs.size(length_dim) if xs is not None else int(max(lengths))
    else:
        assert xs is None
        assert maxlen >= int(max(lengths))

    # (B, maxlen) grid of positions compared against per-sample limits.
    steps = torch.arange(0, maxlen, dtype=torch.int64).unsqueeze(0).expand(batch, maxlen)
    limits = steps.new(lengths).unsqueeze(-1)
    mask = steps >= limits

    if xs is not None:
        assert xs.size(0) == batch, (xs.size(0), batch)

        if length_dim < 0:
            length_dim = xs.dim() + length_dim
        # Insert singleton axes everywhere except batch and length_dim so the
        # (B, maxlen) mask broadcasts to xs' full shape.
        axes = tuple(slice(None) if d in (0, length_dim) else None for d in range(xs.dim()))
        mask = mask[axes].expand_as(xs).to(xs.device)
    return mask
219
+
220
+
221
def make_non_pad_mask(lengths, xs=None, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.

    Logical complement of :func:`make_pad_mask`: valid (non-padded)
    positions are True.

    Args:
        lengths (LongTensor or List): Batch of lengths (B,).
        xs (Tensor, optional): Reference tensor; if set, the mask is expanded
            to its shape along ``length_dim``.
        length_dim (int, optional): Dimension of ``xs`` that corresponds to
            time/length.

    Returns:
        Tensor: Boolean mask; True where the position is within the length.

    Examples:
        >>> make_non_pad_mask([5, 3, 2])
        [[1, 1, 1, 1, 1],
         [1, 1, 1, 0, 0],
         [1, 1, 0, 0, 0]]
    """
    return ~make_pad_mask(lengths, xs, length_dim)
308
+
309
+
310
def mask_by_length(xs, lengths, fill=0):
    """Mask tensor according to length.

    Args:
        xs (Tensor): Batch of input tensor (B, `*`).
        lengths (LongTensor or List): Batch of lengths (B,).
        fill (int or float): Value to fill masked part.

    Returns:
        Tensor: Batch of masked input tensor (B, `*`).

    Examples:
        >>> x = torch.arange(5).repeat(3, 1) + 1
        >>> mask_by_length(x, [5, 3, 2])
        tensor([[1, 2, 3, 4, 5],
                [1, 2, 3, 0, 0],
                [1, 2, 0, 0, 0]])
    """
    assert xs.size(0) == len(lengths)
    # new_full replaces the legacy `xs.data.new(...).fill_()` pattern: same
    # dtype/device result, without going through `.data` (autograd-unsafe).
    ret = xs.new_full(xs.size(), fill)
    for i, l in enumerate(lengths):
        ret[i, :l] = xs[i, :l]
    return ret
339
+
340
+
341
def to_torch_tensor(x):
    """Change to torch.Tensor or ComplexTensor from numpy.ndarray.

    Args:
        x: Inputs. It should be one of numpy.ndarray, Tensor, ComplexTensor,
            and dict.

    Returns:
        Tensor or ComplexTensor: Type converted inputs.

    Examples:
        >>> to_torch_tensor(np.ones(3, dtype=np.float32))
        tensor([1., 1., 1.])
        >>> xs = torch.ones(3, 4, 5)
        >>> assert to_torch_tensor(xs) is xs
    """
    # Tensors pass through untouched.
    if isinstance(x, torch.Tensor):
        return x

    # numpy arrays: complex kinds become ComplexTensor, the rest share memory
    # via from_numpy. torch_complex is imported lazily (python3-only dep).
    if isinstance(x, np.ndarray):
        if x.dtype.kind == "c":
            from torch_complex.tensor import ComplexTensor

            return ComplexTensor(x)
        return torch.from_numpy(x)

    # {'real': ..., 'imag': ...} dicts become ComplexTensor.
    if isinstance(x, dict):
        from torch_complex.tensor import ComplexTensor

        if "real" not in x or "imag" not in x:
            raise ValueError("has 'real' and 'imag' keys: {}".format(list(x)))
        return ComplexTensor(x["real"], x["imag"])

    error = (
        "x must be numpy.ndarray, torch.Tensor or a dict like "
        "{{'real': torch.Tensor, 'imag': torch.Tensor}}, "
        "but got {}".format(type(x))
    )
    # A bare ComplexTensor is accepted too, but only when the optional
    # torch_complex package is importable; otherwise reject outright.
    try:
        from torch_complex.tensor import ComplexTensor
    except Exception:
        raise ValueError(error)
    else:
        if isinstance(x, ComplexTensor):
            return x
        raise ValueError(error)
407
+
408
+
409
def get_subsample(train_args, mode, arch):
    """Parse the subsampling factors from the args for the specified `mode` and `arch`.

    Args:
        train_args: argument Namespace containing options.
        mode: one of ('asr', 'mt', 'st')
        arch: one of ('rnn', 'rnn-t', 'rnn_mix', 'rnn_mulenc', 'transformer')

    Returns:
        np.ndarray / List[np.ndarray]: subsampling factors.

    Raises:
        ValueError: for an unsupported (mode, arch) combination.
    """
    if arch == "transformer":
        return np.array([1])

    elif mode == "mt" and arch == "rnn":
        # +1 means input (+1) and layers outputs (train_args.elayer)
        subsample = np.ones(train_args.elayers + 1, dtype=np.int32)
        logging.warning("Subsampling is not performed for machine translation.")
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    elif (mode == "asr" and arch in ("rnn", "rnn-t")) or (mode == "st" and arch == "rnn"):
        # NOTE: the (mode == "mt" and arch == "rnn") clause that used to be
        # part of this condition was dead code -- fully handled above.
        subsample = np.ones(train_args.elayers + 1, dtype=np.int32)
        if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
            ss = train_args.subsample.split("_")
            for j in range(min(train_args.elayers + 1, len(ss))):
                subsample[j] = int(ss[j])
        else:
            logging.warning(
                "Subsampling is not performed for vgg*. "
                "It is performed in max pooling layers at CNN."
            )
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    elif mode == "asr" and arch == "rnn_mix":
        subsample = np.ones(train_args.elayers_sd + train_args.elayers + 1, dtype=np.int32)
        if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
            ss = train_args.subsample.split("_")
            for j in range(min(train_args.elayers_sd + train_args.elayers + 1, len(ss))):
                subsample[j] = int(ss[j])
        else:
            logging.warning(
                "Subsampling is not performed for vgg*. "
                "It is performed in max pooling layers at CNN."
            )
        logging.info("subsample: " + " ".join([str(x) for x in subsample]))
        return subsample

    elif mode == "asr" and arch == "rnn_mulenc":
        subsample_list = []
        for idx in range(train_args.num_encs):
            subsample = np.ones(train_args.elayers[idx] + 1, dtype=np.int32)
            if train_args.etype[idx].endswith("p") and not train_args.etype[idx].startswith("vgg"):
                ss = train_args.subsample[idx].split("_")
                for j in range(min(train_args.elayers[idx] + 1, len(ss))):
                    subsample[j] = int(ss[j])
            else:
                logging.warning(
                    "Encoder %d: Subsampling is not performed for vgg*. "
                    "It is performed in max pooling layers at CNN.",
                    idx + 1,
                )
            logging.info("subsample: " + " ".join([str(x) for x in subsample]))
            subsample_list.append(subsample)
        return subsample_list

    else:
        raise ValueError("Invalid options: mode={}, arch={}".format(mode, arch))
482
+
483
+
484
def rename_state_dict(old_prefix: str, new_prefix: str, state_dict: Dict[str, torch.Tensor]):
    """Replace keys of old prefix with new prefix in state dict (in place).

    Args:
        old_prefix: prefix to strip from matching keys.
        new_prefix: prefix substituted in its place.
        state_dict: mapping mutated in place.
    """
    # Materialize the matching keys first: we mutate while iterating.
    old_keys = [k for k in state_dict if k.startswith(old_prefix)]
    if len(old_keys) > 0:
        logging.warning(f"Rename: {old_prefix} -> {new_prefix}")
    for k in old_keys:
        v = state_dict.pop(k)
        # str.replace would rewrite *every* occurrence of old_prefix inside
        # the key (e.g. "enc.enc.w"); only the leading prefix must change.
        new_k = new_prefix + k[len(old_prefix):]
        state_dict[new_k] = v
+
496
class Swish(torch.nn.Module):
    """Swish / E-Swish activation.

    Computes ``(beta * x) * sigmoid(x)``; with ``beta == 1`` this reduces to
    the standard Swish (SiLU) activation.

    References:
        https://arxiv.org/abs/2108.12943 / https://arxiv.org/abs/1710.05941v1.
        E-swish variant: https://arxiv.org/abs/1801.07145.

    Args:
        beta: Beta parameter for E-Swish.
            (beta >= 1. If beta < 1, use standard Swish).
        use_builtin: Whether to use PyTorch function if available.

    """

    def __init__(self, beta: float = 1.0, use_builtin: bool = False) -> None:
        super().__init__()

        self.beta = beta

        # The concrete callable is picked once at construction time.
        if beta > 1:
            # E-Swish: the input is scaled by beta before the sigmoid gate.
            self.swish = lambda x: (self.beta * x) * torch.sigmoid(x)
        elif use_builtin:
            self.swish = torch.nn.SiLU()
        else:
            self.swish = lambda x: x * torch.sigmoid(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the activation element-wise."""
        return self.swish(x)
529
+
530
+
531
def get_activation(act):
    """Return a freshly constructed activation module for the given name.

    Args:
        act: One of "hardtanh", "tanh", "relu", "selu", "swish".

    Returns:
        An instance of the corresponding activation module.

    Raises:
        KeyError: If ``act`` is not a known activation name.

    """
    table = {
        "hardtanh": torch.nn.Hardtanh,
        "tanh": torch.nn.Tanh,
        "relu": torch.nn.ReLU,
        "selu": torch.nn.SELU,
        "swish": Swish,
    }

    return table[act]()
543
+
544
+
545
class TooShortUttError(Exception):
    """Signals that an utterance is too short to be subsampled.

    Args:
        message: Error message to display.
        actual_size: The size that cannot pass the subsampling.
        limit: The size limit for subsampling.

    """

    def __init__(self, message: str, actual_size: int, limit: int) -> None:
        """Store the offending size alongside the base exception message."""
        super().__init__(message)

        self.limit = limit
        self.actual_size = actual_size
561
+
562
+
563
def check_short_utt(sub_factor: int, size: int) -> Tuple[bool, int]:
    """Check if the input is too short for subsampling.

    Args:
        sub_factor: Subsampling factor for Conv2DSubsampling.
        size: Input size.

    Returns:
        : Whether an error should be sent.
        : Size limit for specified subsampling factor (-1 if size is fine).

    """
    # Per subsampling factor: (minimum acceptable size, reported limit).
    requirements = {2: (3, 7), 4: (7, 7), 6: (11, 11)}

    if sub_factor in requirements:
        minimum, limit = requirements[sub_factor]
        if size < minimum:
            return True, limit

    return False, -1
583
+
584
+
585
def sub_factor_to_params(sub_factor: int, input_size: int) -> Tuple[int, int, int]:
    """Get conv2D second layer parameters for given subsampling factor.

    Args:
        sub_factor: Subsampling factor (1/X).
        input_size: Input size.

    Returns:
        : Kernel size for second convolution.
        : Stride for second convolution.
        : Conv2DSubsampling output size.

    Raises:
        ValueError: If ``sub_factor`` is not one of 2, 4 or 6.

    """
    # Size after the first conv layer (kernel 3, stride 2), shared by all cases.
    after_first = (input_size - 1) // 2

    if sub_factor == 2:
        return 3, 1, after_first - 2
    if sub_factor == 4:
        return 3, 2, (after_first - 1) // 2
    if sub_factor == 6:
        return 5, 3, (after_first - 2) // 3

    raise ValueError("subsampling_factor parameter should be set to either 2, 4 or 6.")
606
+
607
+
608
def make_chunk_mask(
    size: int,
    chunk_size: int,
    left_chunk_size: int = 0,
    device: torch.device = None,
) -> torch.Tensor:
    """Create chunk mask for the subsequent steps (size, size).

    Position ``i`` may attend within its own chunk plus ``left_chunk_size``
    chunks of left context. The returned mask is True where attention is
    NOT allowed.

    Reference: https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py

    Args:
        size: Size of the source mask.
        chunk_size: Number of frames in chunk.
        left_chunk_size: Size of the left context in chunks
            (negative means full left context; 0 means no left context).
        device: Device for the mask tensor.

    Returns:
        mask: Chunk mask. (size, size)

    """
    pos = torch.arange(size, device=device)
    chunk_idx = pos // chunk_size

    # Visible frame window [start, end) for each row, computed in one shot
    # instead of a per-row Python loop with slice assignment.
    end = torch.clamp((chunk_idx + 1) * chunk_size, max=size)
    if left_chunk_size < 0:
        start = torch.zeros_like(pos)
    else:
        start = torch.clamp((chunk_idx - left_chunk_size) * chunk_size, min=0)

    cols = pos.unsqueeze(0)  # (1, size)
    visible = (cols >= start.unsqueeze(1)) & (cols < end.unsqueeze(1))

    return ~visible
640
+
641
+
642
def make_source_mask(lengths: torch.Tensor) -> torch.Tensor:
    """Create source mask for given lengths.

    Reference: https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py

    Args:
        lengths: Sequence lengths. (B,)

    Returns:
        : Mask for the sequence lengths, True at padded positions. (B, max_len)

    """
    longest = lengths.max()

    # Broadcast a 0..max_len-1 position row against per-sequence lengths:
    # positions at or beyond a sequence's length are padding.
    positions = torch.arange(longest).to(lengths)

    return positions.unsqueeze(0) >= lengths.unsqueeze(1)
660
+
661
+
662
def get_transducer_task_io(
    labels: torch.Tensor,
    encoder_out_lens: torch.Tensor,
    ignore_id: int = -1,
    blank_id: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Get Transducer loss I/O.

    Args:
        labels: Label ID sequences. (B, L)
        encoder_out_lens: Encoder output lengths. (B,)
        ignore_id: Padding symbol ID.
        blank_id: Blank symbol ID.

    Returns:
        decoder_in: Decoder inputs (blank-prepended labels). (B, U)
        target: Target label ID sequences. (B, U)
        t_len: Time lengths. (B,)
        u_len: Label lengths. (B,)

    """

    def _pad_batch(seqs: List[torch.Tensor], fill: int = 0):
        """Stack variable-length sequences into one right-padded batch tensor.

        Args:
            seqs: Labels sequences. [B x (?)]
            fill: Padding value.

        Returns:
            Batch of padded labels sequences. (B, max_len, ...)

        """
        longest = max(s.size(0) for s in seqs)
        out = seqs[0].new_full((len(seqs), longest, *seqs[0].size()[1:]), fill)

        for row, s in enumerate(seqs):
            out[row, : s.size(0)] = s

        return out

    device = labels.device

    # Drop padding symbols before building decoder input / target.
    stripped = [seq[seq != ignore_id] for seq in labels]
    blank = labels[0].new([blank_id])

    decoder_in = _pad_batch(
        [torch.cat((blank, seq), dim=0) for seq in stripped], blank_id
    ).to(device)

    target = _pad_batch(stripped, blank_id).type(torch.int32).to(device)

    t_len = torch.IntTensor([int(n) for n in encoder_out_lens]).to(device)
    u_len = torch.IntTensor([seq.size(0) for seq in stripped]).to(device)

    return decoder_in, target, t_len, u_len
725
+
726
+
727
def pad_to_len(t: torch.Tensor, pad_len: int, dim: int):
    """Pad the tensor `t` at `dim` to the length `pad_len` with right padding zeros."""
    shortfall = pad_len - t.size(dim)
    if shortfall == 0:
        # Already at the target length; return the tensor unchanged.
        return t

    filler_shape = list(t.shape)
    filler_shape[dim] = shortfall
    filler = torch.zeros(filler_shape, dtype=t.dtype, device=t.device)

    return torch.cat((t, filler), dim=dim)
funcineforge/tokenizer/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .tokenizer import FunCineForgeTokenizer