File size: 7,559 Bytes
b28205b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import torch
from TTS.api import TTS
import os
from tqdm import tqdm
import argparse
# Batch text-to-speech synthesis script.
#
# Reads a tab-separated manifest where every line has four fields:
#     idx <TAB> lang <TAB> text <TAB> ref_file
# and, for each entry, synthesizes `text` in language `lang` using the
# reference audio `<ref_dir>/<ref_file>` as the speaker prompt. The result is
# written to `<savedir>/<idx>_<lang>_<basename of ref_file>`.


def build_parser():
    """Return the argument parser describing the script's command line."""
    parser = argparse.ArgumentParser(description="Text-to-Speech Synthesis")
    parser.add_argument('-t', '--text_file', type=str, required=True,
                        help='Path to text file containing text and audio reference files')
    parser.add_argument('-r', '--ref_dir', type=str, required=True,
                        help='Root directory containing reference audio files')
    parser.add_argument('-s', '--savedir', type=str, required=True,
                        help='Directory to store synthesized audio files')
    parser.add_argument('-d', '--device', type=str, required=True,
                        help='Device to use for synthesis (cpu or cuda)')
    parser.add_argument('-m', '--model_path', type=str,
                        default="/app/models/best_model_479919.pth",
                        help='Path to the model file')
    parser.add_argument('-c', '--config_path', type=str,
                        default="/app/models/config.json",
                        help='Path to the config file')
    return parser


def resolve_device(requested, cuda_available=None):
    """Turn the ``--device`` value into the device string given to ``.to()``.

    ``cpu``, ``cuda`` and indexed CUDA devices such as ``cuda:1`` are honoured
    as given (case-insensitively). Any other value triggers auto-detection:
    ``cuda`` when CUDA is available, otherwise ``cpu``.

    Args:
        requested: Raw value of the ``--device`` argument.
        cuda_available: Optional override for the CUDA availability check;
            when ``None`` the answer comes from ``torch.cuda.is_available()``.

    Returns:
        The device string to use.
    """
    choice = requested.strip().lower()
    prefix, sep, index = choice.partition(":")
    # Accept "cuda:<N>" so a specific GPU can be selected instead of being
    # silently replaced by the auto-detected default device.
    indexed_cuda = (
        prefix == "cuda"
        and sep == ":"
        and index != ""
        and all(ch in "0123456789" for ch in index)
    )
    if choice in ("cpu", "cuda") or indexed_cuda:
        return choice
    if cuda_available is None:
        cuda_available = torch.cuda.is_available()
    return "cuda" if cuda_available else "cpu"


def parse_entry(line):
    """Split one manifest line into ``(idx, lang, text, ref_file)``.

    Args:
        line: One raw line of the manifest, with or without its newline.

    Returns:
        A 4-tuple of whitespace-trimmed fields, or ``None`` when the line does
        not consist of exactly four non-empty tab-separated fields.
    """
    fields = [field.strip() for field in line.strip().split("\t")]
    if len(fields) != 4 or not all(fields):
        return None
    return tuple(fields)


def output_path_for(savedir, idx, lang, ref_file):
    """Return the output path ``<savedir>/<idx>_<lang>_<basename(ref_file)>``."""
    return os.path.join(savedir, f"{idx}_{lang}_{os.path.basename(ref_file)}")


def synthesize_entries(tts, lines, ref_dir, savedir):
    """Synthesize every well-formed manifest line and report the outcome.

    Blank lines are ignored. Malformed lines and entries whose reference audio
    file does not exist are reported and skipped so that a single bad entry
    does not abort the rest of the batch.

    Args:
        tts: Loaded model object exposing ``tts_to_file``.
        lines: Raw manifest lines.
        ref_dir: Root directory the ``ref_file`` fields are relative to.
        savedir: Directory receiving the synthesized audio files.

    Returns:
        A ``(synthesized, skipped)`` pair of counts.
    """
    synthesized = 0
    skipped = 0
    for line_no, line in enumerate(tqdm(lines), start=1):
        if not line.strip():
            continue  # blank separator lines are not entries
        entry = parse_entry(line)
        # tqdm.write keeps messages from breaking the progress bar rendering.
        if entry is None:
            tqdm.write(f"Warning: Line {line_no} does not have 4 non-empty tab-separated parts. Skipping.")
            skipped += 1
            continue
        idx, lang, text, ref_file = entry
        ref_path = os.path.join(ref_dir, ref_file)
        if not os.path.isfile(ref_path):
            tqdm.write(f"Warning: Line {line_no} reference audio not found: {ref_path}. Skipping.")
            skipped += 1
            continue
        save_path = output_path_for(savedir, idx, lang, ref_file)
        tqdm.write(f"Synthesizing: {text[:30]}... using reference {ref_path}")
        tts.tts_to_file(text=text, speaker_wav=ref_path, language=lang, file_path=save_path)
        tqdm.write(f"Saved to: {save_path}")
        synthesized += 1
    return synthesized, skipped


def main(argv=None):
    """Run the batch synthesis described by the command-line arguments.

    Args:
        argv: Optional argument list; ``None`` makes argparse read ``sys.argv``.
    """
    # Parse arguments
    args = build_parser().parse_args(argv)

    # Get device
    device = resolve_device(args.device)
    print(f"Using device: {device}")

    # Initialize TTS model
    print(f"Loading model from {args.model_path} with config {args.config_path}")
    tts = TTS(
        model_path=args.model_path,
        config_path=args.config_path,
        progress_bar=False,
    ).to(device)

    # Create output directory
    os.makedirs(args.savedir, exist_ok=True)
    print(f"Output directory: {args.savedir}")

    # Read the text file. The encoding is explicit because the manifest holds
    # non-ASCII text and the locale default is not reliable; "utf-8-sig" also
    # drops a leading byte-order mark that would otherwise end up in `idx`.
    print(f"Reading text file: {args.text_file}")
    with open(args.text_file, 'r', encoding="utf-8-sig") as f:
        lines = f.readlines()

    # Process each line
    print(f"Processing {len(lines)} entries...")
    synthesized, skipped = synthesize_entries(tts, lines, args.ref_dir, args.savedir)
    if skipped:
        print(f"Synthesized {synthesized} entries, skipped {skipped}.")
    print("Synthesis complete!")


if __name__ == "__main__":
    main()
# import torch
# from TTS.api import TTS
# import os
# from tqdm import tqdm
# import argparse
# # Get device
# device = "cuda:3" if torch.cuda.is_available() else "cpu"
# sentences_dict = {
# "te": ["వడ్రంగి, క్షురక వృత్తులలో పెట్టుబడి ప్రధానమై ఇతరులు కూడా ఈ వృత్తిలో ప్రవేశించి వ్యాపారంగా మార్చేసార",
# "నేను ఈ రోజు నాకు ఇష్టమైన పుస్తకాన్ని చదివాను మరియు తరువాత నా స్నేహితుడితో సినిమాకు వెళ్ళాను",
# "ఈ వేసవి సెలవులలో నేను నా కుటుంబంతో కలిసి ఒక అందమైన బీచ్కి వెళ్ళాలని అనుకుంటున్నాను"],
# "mr": ["जायकवाडी धरणातून तब्बल अडीच ते तीन लाख हेक्टर शेतीच्या सिंचनासाठी पाणी सोडलं जातं",
# "मी आज माझ्या आवडत्या पुस्तकाचे वाचन केले आणि नंतर माझ्या मित्रासोबत चित्रपटाला गेलो",
# "या उन्हाळी सुट्टीत मी माझ्या कुटुंबासोबत एक सुंदर समुद्रकिनाऱ्यावर जाण्याचा विचार करतो"],
# "bho": ["बिहार के बक्सर जिला के बक्सर नगर निगम क्षेत्र में गंगा नदी पर बने बक्सर पुल का उद्घाटन आज प्रधानमंत्री नरेंद्र मोदी करेंगे",
# "एन्ट्रापी कंप्यूटिंग में एन्ट्रोपी ऊ ऑपरेटिंग सिस्टम ह जे पे सरा क्रिप्टोग्राफिक फंक्शन सब काम करे लें",
# "हमार मंडराये वाली जहाज़ सर्पमीनन से भरी है"],
# }
# tts = TTS(
# model_path="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/yourtts_syspin_baseline-April-19-2025_10+55AM-0b13ea658/best_model_479919.pth",
# config_path="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/yourtts_syspin_baseline-April-19-2025_10+55AM-0b13ea658/config.json",
# progress_bar=False,
# ).to(device)
# parser = argparse.ArgumentParser(description="Text-to-Speech Synthesis")
# parser.add_argument('-t', '--text_file', type=str, required=True,
# help='Path to text file containing text and audio reference files')
# parser.add_argument('-r', '--ref_dir', type=str, required=True,
# help='Root directory containing reference audio files')
# parser.add_argument('-s', '--savedir', type=str, required=True,
# help='Directory to store synthesized audio files')
# parser.add_argument('-d', '--device', type=str, required=True,
# help='Device to use for synthesis (cpu or cuda)')
# args = parser.parse_args()
# os.makedirs(args.savedir, exist_ok=True)
# # Read the text file
# with open(args.text_file, 'r') as f:
# lines = f.readlines()
# for line in lines:
# idx, lang, text, ref_file = line.strip().split('\t')
# ref_file = os.path.join(args.ref_dir, ref_file)
# save_path = os.path.join(args.savedir, f"{idx}_{lang}_{os.path.basename(ref_file)}")
# tts.tts_to_file(text=text, speaker_wav=ref_file, language=lang, file_path=save_path)
# # ref_files = [os.path.join("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_samples/", x) for x in os.listdir("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_samples/")]
# # for ref_file in ref_files:
# # for language_key in sentences_dict.keys():
# # for s_idx, sentence in enumerate(sentences_dict[language_key]):
# # save_path = os.path.join("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_infers/", f"test_{language_key}_{s_idx}_{os.path.basename(ref_file)}")
# # tts.tts_to_file(text=sentence, speaker_wav=ref_file, language=language_key, file_path=save_path)
# # tts.tts_to_file(text="ಹಸ್ದೇವ್ ನದಿ, ರಿಹಂಡ್ ನದಿ ಮತ್ತು ಕನ್ಹರ್ ನದಿಗಳು ಸುರ್ಗುಜಾದ ಮುಖಜ ಭೂಮಿಯಲ್ಲಿ ಹರಿಯುತ್ತವೆ.", speaker_wav="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/syspin_data/Chhattisgarhi_Male/wavs/IISc_SYSPINProject_chha_m_AGRI_00001.wav", language="kn", file_path="test_kn.wav")
|