Humair332 committed on
Commit
545efe4
·
verified ·
1 Parent(s): a070ef9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import soundfile as sf
5
+ from scipy.signal import resample
6
+
7
+ # import your codec
8
+ from irodori_tts.codec import DACVAECodec
9
+
10
+
11
+ # =============================
12
+ # LOAD MODEL
13
+ # =============================
14
# Run on the GPU when PyTorch can see one; fall back to the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# The pretrained codec is loaded once at import time and shared by every
# request handler defined in this file.
codec = DACVAECodec.load(
    repo_id="Aratako/Semantic-DACVAE-Japanese-32dim", device=DEVICE
)
20
+
21
+
22
+ # =============================
23
+ # AUDIO UTILS (NO TORCHAUDIO)
24
+ # =============================
25
def load_audio(path):
    """Read an audio file as float32 samples and collapse it to mono.

    Args:
        path: Location of the audio file, passed straight to ``sf.read``.

    Returns:
        A ``(samples, sample_rate)`` tuple. When the decoded array has more
        than one dimension it is averaged over axis 1 (presumably the
        channel axis -- confirm against the soundfile docs), so the result
        is one-dimensional.
    """
    samples, sample_rate = sf.read(path, dtype="float32")
    is_multichannel = samples.ndim > 1
    mono = samples.mean(axis=1) if is_multichannel else samples
    return mono, sample_rate
33
+
34
+
35
def resample_audio(audio, orig_sr, target_sr):
    """Resample a 1-D waveform from ``orig_sr`` to ``target_sr``.

    Args:
        audio: 1-D array of samples.
        orig_sr: Sample rate of ``audio``.
        target_sr: Sample rate to convert to.

    Returns:
        ``audio`` unchanged when the two rates are equal. Otherwise an array
        of ``int(len(audio) * target_sr / orig_sr)`` samples; an empty array
        when that length truncates to zero. Floating-point inputs keep their
        dtype.
    """
    if orig_sr == target_sr:
        return audio

    source = np.asarray(audio)
    num_samples = int(len(source) * target_sr / orig_sr)
    if num_samples < 1:
        # scipy's FFT resampler cannot produce a zero-length output, so an
        # empty (or too short) clip is answered with an empty array instead
        # of an exception.
        return source[:0]

    resampled = resample(source, num_samples)
    if np.issubdtype(source.dtype, np.floating):
        # Keep e.g. float32 as float32 so downstream tensors do not silently
        # become double precision.
        resampled = resampled.astype(source.dtype, copy=False)
    return resampled
41
+
42
+
43
def to_tensor(audio):
    """Wrap a mono waveform in a float32 tensor shaped ``(1, 1, T)``.

    Args:
        audio: 1-D array-like of samples.

    Returns:
        A CPU ``torch.float32`` tensor with two leading singleton axes.
    """
    # torch.from_numpy refuses negative-stride views and non-ndarray inputs
    # and would pass float64 straight through. Normalising to a contiguous
    # float32 array first matches the dtype the audio is loaded with; it is
    # a no-op (no copy) for input that already has that layout.
    samples = np.ascontiguousarray(audio, dtype=np.float32)
    return torch.from_numpy(samples).unsqueeze(0).unsqueeze(0)  # (1,1,T)
45
+
46
+
47
+ # =============================
48
+ # ENCODE
49
+ # =============================
50
def encode_audio(file):
    """Encode an audio file into the codec's latent representation.

    The file is loaded as mono, resampled to ``codec.sample_rate``, moved to
    ``DEVICE`` and passed through ``codec.encode_waveform``.

    Args:
        file: Path of the audio file to encode (the Gradio audio input is
            configured with ``type="filepath"``).

    Returns:
        The latent as a NumPy array on the CPU. Its shape is whatever
        ``codec.encode_waveform`` produces.

    Raises:
        gr.Error: If no file was supplied.
    """
    if not file:
        # Gradio passes None when the user clicks Encode without uploading.
        raise gr.Error("Please provide an audio file to encode.")

    audio, sr = load_audio(file)
    audio = resample_audio(audio, sr, codec.sample_rate)

    # Force float32 in case resampling returned double precision.
    wav = to_tensor(audio).to(DEVICE, dtype=torch.float32)

    # Inference only: skip autograd bookkeeping so the result can be turned
    # into a NumPy array without tripping over requires_grad.
    with torch.no_grad():
        latent = codec.encode_waveform(wav, codec.sample_rate)

    return latent.detach().cpu().numpy()
61
+
62
+
63
+ # =============================
64
+ # DECODE
65
+ # =============================
66
def decode_audio(latent_np):
    """Decode a latent array back into a waveform.

    Args:
        latent_np: Latent values as a NumPy array (or anything
            ``torch.as_tensor`` accepts). A 2-D input gets a leading batch
            axis added before decoding.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is the decoder
        output with all singleton axes squeezed away, as a NumPy array.
    """
    # Arrays built from pasted Python floats default to float64; cast to
    # float32 so the tensor does not reach the codec as double precision.
    latent = torch.as_tensor(latent_np, dtype=torch.float32, device=DEVICE)

    if latent.ndim == 2:
        latent = latent.unsqueeze(0)

    # Inference only: no autograd graph is needed, and .numpy() would fail
    # on a tensor that still requires grad.
    with torch.no_grad():
        audio = codec.decode_latent(latent)

    audio = audio.detach().squeeze().cpu().numpy()

    return (codec.sample_rate, audio)
77
+
78
+
79
+ # =============================
80
+ # GRADIO UI
81
+ # =============================
82
with gr.Blocks() as demo:
    gr.Markdown("## 🎧 DACVAE Audio Codec (SoundFile Version)")

    with gr.Tab("Encode"):
        audio_in = gr.Audio(type="filepath")
        latent_out = gr.Textbox(label="Latent (numpy array)")

        def encode_to_text(file):
            """Encode ``file`` and serialise the latent as a JSON nested list.

            A raw ndarray shown in a textbox is rendered with NumPy's
            ``str()``, which drops commas and elides large arrays with
            ``...``; that text cannot be parsed back. JSON keeps every value
            and round-trips through the Decode tab.
            """
            import json

            latent = encode_audio(file)
            return json.dumps(np.asarray(latent).tolist())

        btn_encode = gr.Button("Encode")
        btn_encode.click(encode_to_text, inputs=audio_in, outputs=latent_out)

    with gr.Tab("Decode"):
        latent_in = gr.Textbox(label="Paste latent numpy array")
        audio_out = gr.Audio()

        def decode_from_text(text):
            """Parse pasted latent text and decode it to audio.

            Accepts a JSON array or a plain Python literal (nested
            lists/tuples of numbers).

            Raises:
                gr.Error: If the text is empty or is not a rectangular
                    numeric array.
            """
            import ast
            import json

            if not text or not text.strip():
                raise gr.Error("Paste a latent array before decoding.")

            # SECURITY: this text comes straight from the browser, so it must
            # never be handed to eval() -- that would let any visitor run
            # arbitrary code on the server. Only data literals are parsed.
            try:
                values = json.loads(text)
            except ValueError:
                try:
                    values = ast.literal_eval(text.strip())
                except (ValueError, SyntaxError, MemoryError, RecursionError) as err:
                    raise gr.Error(
                        "Latent must be a JSON or Python list of numbers."
                    ) from err

            try:
                latent = np.array(values, dtype=np.float32)
            except (ValueError, TypeError) as err:
                raise gr.Error(
                    "Latent must be a rectangular array of numbers."
                ) from err

            return decode_audio(latent)

        btn_decode = gr.Button("Decode")
        btn_decode.click(decode_from_text, inputs=latent_in, outputs=audio_out)
102
+
103
+
104
+ # =============================
105
+ # RUN
106
+ # =============================
107
# Start the Gradio server only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    demo.launch()