Spaces:
Running
Running
files update
Browse files- app.py +461 -378
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,15 +1,23 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import os
|
| 11 |
import base64
|
|
|
|
| 12 |
import math
|
|
|
|
| 13 |
import shutil
|
| 14 |
import struct
|
| 15 |
import subprocess
|
|
@@ -18,9 +26,10 @@ import time
|
|
| 18 |
import re
|
| 19 |
|
| 20 |
import gradio as gr
|
|
|
|
| 21 |
from openai import OpenAI
|
| 22 |
|
| 23 |
-
# Optional document parsers
|
| 24 |
try:
|
| 25 |
import pypdf
|
| 26 |
HAS_PYPDF = True
|
|
@@ -33,139 +42,118 @@ try:
|
|
| 33 |
except ImportError:
|
| 34 |
HAS_DOCX = False
|
| 35 |
|
| 36 |
-
# ──────────────────────────────────────────────
|
| 37 |
# Configuration
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
# Maximum characters per chunk sent to the API
|
| 43 |
-
# The model has token limits, so we split long texts
|
| 44 |
MAX_CHARS_PER_CHUNK = 1500
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
# Core 10 languages have the best quality; extended languages are supported
|
| 48 |
-
# but may vary in quality as they include dialects
|
| 49 |
LANGUAGES = {
|
| 50 |
-
# ── Core 10 Languages (highest quality) ──
|
| 51 |
"English": {"code": "en", "native": "English", "tier": "core"},
|
| 52 |
-
"Chinese (Mandarin)": {"code": "zh", "native": "
|
| 53 |
-
"Japanese": {"code": "ja", "native": "
|
| 54 |
-
"Korean": {"code": "ko", "native": "
|
| 55 |
"German": {"code": "de", "native": "Deutsch", "tier": "core"},
|
| 56 |
-
"French": {"code": "fr", "native": "
|
| 57 |
-
"Russian": {"code": "ru", "native": "
|
| 58 |
-
"Portuguese": {"code": "pt", "native": "
|
| 59 |
-
"Spanish": {"code": "es", "native": "
|
| 60 |
"Italian": {"code": "it", "native": "Italiano", "tier": "core"},
|
| 61 |
-
|
| 62 |
-
"Arabic": {"code": "ar", "native": "العربية", "tier": "extended"},
|
| 63 |
"Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
|
| 64 |
"Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
|
| 65 |
-
"Turkish": {"code": "tr", "native": "
|
| 66 |
-
"Vietnamese": {"code": "vi", "native": "
|
| 67 |
-
"Thai": {"code": "th", "native": "
|
| 68 |
"Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
|
| 69 |
"Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
|
| 70 |
-
"Hindi": {"code": "hi", "native": "
|
| 71 |
-
"Bengali": {"code": "bn", "native": "
|
| 72 |
-
"Urdu": {"code": "ur", "native": "
|
| 73 |
"Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
|
| 74 |
-
"Czech": {"code": "cs", "native": "
|
| 75 |
-
"Romanian": {"code": "ro", "native": "
|
| 76 |
-
"Greek": {"code": "el", "native": "
|
| 77 |
"Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
|
| 78 |
"Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
|
| 79 |
"Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
|
| 80 |
"Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
|
| 81 |
-
"Ukrainian": {"code": "uk", "native": "
|
| 82 |
-
"Hebrew": {"code": "he", "native": "
|
| 83 |
-
"Persian": {"code": "fa", "native": "
|
| 84 |
-
"Cantonese": {"code": "yue", "native": "
|
| 85 |
"Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
|
| 86 |
"Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
|
| 87 |
-
"Tamil": {"code": "ta", "native": "
|
| 88 |
}
|
| 89 |
|
| 90 |
-
|
| 91 |
-
"
|
| 92 |
-
|
| 93 |
-
"Ryan — Dramatic, rhythmic",
|
| 94 |
-
"Kai — Soothing, calm",
|
| 95 |
-
"Neil — Precise, clear",
|
| 96 |
-
"Lenn — Rational, steady",
|
| 97 |
-
"Aiden — Young, lively",
|
| 98 |
-
"Eldric Sage — Authoritative narrator",
|
| 99 |
-
"Arthur — Classic, mature",
|
| 100 |
-
"Elias — Soft, thoughtful",
|
| 101 |
-
"Alek — Confident, modern",
|
| 102 |
-
"Andre — Deep, resonant",
|
| 103 |
-
"Emilien — Gentle, French-inspired",
|
| 104 |
-
"Vincent — Rich, theatrical",
|
| 105 |
-
],
|
| 106 |
-
"Female Voices": [
|
| 107 |
-
"Cherry — Sunny, friendly",
|
| 108 |
-
"Serena — Gentle, soft",
|
| 109 |
-
"Jennifer — Cinematic narrator",
|
| 110 |
-
"Katerina — Mature, rich rhythm",
|
| 111 |
-
"Chelsie — Bright, expressive",
|
| 112 |
-
"Mia — Young, versatile",
|
| 113 |
-
"Bella — Elegant, warm",
|
| 114 |
-
"Vivian — Professional, clear",
|
| 115 |
-
"Moon — Dreamy, ethereal",
|
| 116 |
-
"Maia — Confident, articulate",
|
| 117 |
-
"Seren — Calm, measured",
|
| 118 |
-
"Dolce — Sweet, melodic",
|
| 119 |
-
"Bellona — Strong, commanding",
|
| 120 |
-
"Bunny — Playful, light",
|
| 121 |
-
"Momo — Cute, upbeat",
|
| 122 |
-
"Mochi — Soft, adorable",
|
| 123 |
-
],
|
| 124 |
}
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
-
def get_voice_name(voice_label: str) -> str:
|
| 134 |
-
"""Extract just the voice name from 'Name — Description' format."""
|
| 135 |
-
return voice_label.split("—")[0].strip()
|
| 136 |
|
| 137 |
-
|
| 138 |
-
# ──────────────────────────────────────────────
|
| 139 |
# Audio helpers
|
| 140 |
-
|
| 141 |
-
def base64_to_wav(b64_data: str, output_path: str):
|
| 142 |
-
"""Decode base64 PCM data and write a proper WAV file."""
|
| 143 |
audio_bytes = base64.b64decode(b64_data)
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
with open(output_path, "wb") as f:
|
| 151 |
f.write(b"RIFF")
|
| 152 |
-
f.write(struct.pack("<I", 36 +
|
| 153 |
f.write(b"WAVE")
|
| 154 |
f.write(b"fmt ")
|
| 155 |
f.write(struct.pack("<I", 16))
|
| 156 |
f.write(struct.pack("<H", 1))
|
| 157 |
-
f.write(struct.pack("<H",
|
| 158 |
-
f.write(struct.pack("<I",
|
| 159 |
-
f.write(struct.pack("<I",
|
| 160 |
-
f.write(struct.pack("<H",
|
| 161 |
-
f.write(struct.pack("<H",
|
| 162 |
f.write(b"data")
|
| 163 |
-
f.write(struct.pack("<I",
|
| 164 |
f.write(audio_bytes)
|
| 165 |
|
| 166 |
|
| 167 |
-
def concatenate_wavs(wav_files
|
| 168 |
-
"""Concatenate multiple WAV files using ffmpeg."""
|
| 169 |
if not wav_files:
|
| 170 |
return
|
| 171 |
if len(wav_files) == 1:
|
|
@@ -183,43 +171,90 @@ def concatenate_wavs(wav_files: list, output_path: str):
|
|
| 183 |
os.remove(list_file)
|
| 184 |
|
| 185 |
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
# Text splitting
|
| 188 |
-
|
| 189 |
-
def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> list:
|
| 190 |
-
"""
|
| 191 |
-
Split text into chunks at sentence boundaries.
|
| 192 |
-
Tries to keep paragraphs together when possible.
|
| 193 |
-
"""
|
| 194 |
-
# Normalize whitespace
|
| 195 |
text = text.strip()
|
| 196 |
if not text:
|
| 197 |
return []
|
| 198 |
-
|
| 199 |
-
# If short enough, return as-is
|
| 200 |
if len(text) <= max_chars:
|
| 201 |
return [text]
|
| 202 |
|
| 203 |
chunks = []
|
| 204 |
-
# First split by paragraphs
|
| 205 |
paragraphs = re.split(r"\n\s*\n", text)
|
| 206 |
-
|
| 207 |
current_chunk = ""
|
|
|
|
| 208 |
for para in paragraphs:
|
| 209 |
para = para.strip()
|
| 210 |
if not para:
|
| 211 |
continue
|
| 212 |
-
|
| 213 |
-
# If adding this paragraph keeps us under the limit
|
| 214 |
if len(current_chunk) + len(para) + 2 <= max_chars:
|
| 215 |
current_chunk = (current_chunk + "\n\n" + para).strip()
|
| 216 |
else:
|
| 217 |
-
# Save current chunk if it has content
|
| 218 |
if current_chunk:
|
| 219 |
chunks.append(current_chunk)
|
| 220 |
current_chunk = ""
|
| 221 |
-
|
| 222 |
-
# If the paragraph itself is too long, split by sentences
|
| 223 |
if len(para) > max_chars:
|
| 224 |
sentences = re.split(r"(?<=[.!?])\s+", para)
|
| 225 |
for sentence in sentences:
|
|
@@ -228,7 +263,6 @@ def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> l
|
|
| 228 |
else:
|
| 229 |
if current_chunk:
|
| 230 |
chunks.append(current_chunk)
|
| 231 |
-
# If a single sentence is too long, force-split it
|
| 232 |
if len(sentence) > max_chars:
|
| 233 |
words = sentence.split()
|
| 234 |
current_chunk = ""
|
|
@@ -246,64 +280,148 @@ def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> l
|
|
| 246 |
|
| 247 |
if current_chunk:
|
| 248 |
chunks.append(current_chunk)
|
| 249 |
-
|
| 250 |
return chunks
|
| 251 |
|
| 252 |
|
| 253 |
-
#
|
| 254 |
-
#
|
| 255 |
-
#
|
| 256 |
-
def
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
""
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
|
| 273 |
|
| 274 |
if translate and language != "English":
|
| 275 |
system_prompt = (
|
| 276 |
f"You are a professional audiobook narrator and translator.\n"
|
| 277 |
-
f"
|
| 278 |
-
f"
|
| 279 |
-
f"
|
| 280 |
-
f"3. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
|
| 281 |
-
f" descriptions, and emotional moments.\n"
|
| 282 |
-
f"4. Respond ONLY with the spoken {language} narration — no English,\n"
|
| 283 |
-
f" no meta-commentary, no chapter headers unless they're in the text.\n"
|
| 284 |
-
f"5. Maintain a natural reading pace suitable for an audiobook.\n"
|
| 285 |
-
f"6. Translate idioms and cultural references appropriately."
|
| 286 |
-
)
|
| 287 |
-
user_text = (
|
| 288 |
-
f"Translate the following English text into {language} and narrate it "
|
| 289 |
-
f"as an audiobook. Respond only with the spoken {language} narration:\n\n{text}"
|
| 290 |
)
|
|
|
|
| 291 |
else:
|
| 292 |
system_prompt = (
|
| 293 |
"You are a professional audiobook narrator.\n"
|
| 294 |
-
"
|
| 295 |
-
"
|
| 296 |
-
"2. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
|
| 297 |
-
" descriptions, and emotional moments.\n"
|
| 298 |
-
"3. Respond ONLY with the spoken narration — no meta-commentary.\n"
|
| 299 |
-
"4. Maintain a natural reading pace suitable for an audiobook.\n"
|
| 300 |
-
"5. Pause appropriately between paragraphs and at punctuation."
|
| 301 |
)
|
| 302 |
-
user_text = f"Narrate
|
| 303 |
|
| 304 |
try:
|
| 305 |
completion = client.chat.completions.create(
|
| 306 |
-
model=
|
| 307 |
messages=[
|
| 308 |
{"role": "system", "content": system_prompt},
|
| 309 |
{"role": "user", "content": user_text},
|
|
@@ -336,103 +454,18 @@ def generate_speech_chunk(
|
|
| 336 |
full_audio_b64 = "".join(audio_chunks)
|
| 337 |
base64_to_wav(full_audio_b64, output_wav)
|
| 338 |
return output_wav, transcript
|
| 339 |
-
|
| 340 |
-
return None, "No audio received from API"
|
| 341 |
|
| 342 |
except Exception as e:
|
| 343 |
return None, str(e)
|
| 344 |
|
| 345 |
|
| 346 |
-
#
|
| 347 |
-
#
|
| 348 |
-
#
|
| 349 |
-
def
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
["ffmpeg", "-y", "-f", "lavfi",
|
| 353 |
-
"-i", f"anullsrc=r=24000:cl=mono",
|
| 354 |
-
"-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
|
| 355 |
-
capture_output=True, check=True,
|
| 356 |
-
)
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
# ──────────────────────────────────────────────
|
| 360 |
-
# Document text extraction
|
| 361 |
-
# ──────────────────────────────────────────────
|
| 362 |
-
def extract_text_from_pdf(filepath: str) -> str:
|
| 363 |
-
"""Extract text from a PDF file using pypdf."""
|
| 364 |
-
if not HAS_PYPDF:
|
| 365 |
-
raise ImportError("pypdf is not installed. Cannot read PDF files.")
|
| 366 |
-
reader = pypdf.PdfReader(filepath)
|
| 367 |
-
pages = []
|
| 368 |
-
for page in reader.pages:
|
| 369 |
-
text = page.extract_text()
|
| 370 |
-
if text:
|
| 371 |
-
pages.append(text.strip())
|
| 372 |
-
return "\n\n".join(pages)
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
def extract_text_from_docx(filepath: str) -> str:
|
| 376 |
-
"""Extract text from a .docx file using python-docx."""
|
| 377 |
-
if not HAS_DOCX:
|
| 378 |
-
raise ImportError("python-docx is not installed. Cannot read Word files.")
|
| 379 |
-
doc = docx.Document(filepath)
|
| 380 |
-
paragraphs = []
|
| 381 |
-
for para in doc.paragraphs:
|
| 382 |
-
text = para.text.strip()
|
| 383 |
-
if text:
|
| 384 |
-
paragraphs.append(text)
|
| 385 |
-
return "\n\n".join(paragraphs)
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
def extract_text_from_file(filepath: str) -> str:
|
| 389 |
-
"""Extract text from a file based on its extension."""
|
| 390 |
-
ext = os.path.splitext(filepath)[1].lower()
|
| 391 |
-
|
| 392 |
-
if ext == ".pdf":
|
| 393 |
-
return extract_text_from_pdf(filepath)
|
| 394 |
-
elif ext in (".docx", ".doc"):
|
| 395 |
-
if ext == ".doc":
|
| 396 |
-
# .doc (old format) — try converting with LibreOffice if available
|
| 397 |
-
try:
|
| 398 |
-
tmp_dir = tempfile.mkdtemp()
|
| 399 |
-
subprocess.run(
|
| 400 |
-
["libreoffice", "--headless", "--convert-to", "docx",
|
| 401 |
-
"--outdir", tmp_dir, filepath],
|
| 402 |
-
capture_output=True, check=True, timeout=60,
|
| 403 |
-
)
|
| 404 |
-
docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
|
| 405 |
-
docx_path = os.path.join(tmp_dir, docx_name)
|
| 406 |
-
if os.path.exists(docx_path):
|
| 407 |
-
text = extract_text_from_docx(docx_path)
|
| 408 |
-
shutil.rmtree(tmp_dir, ignore_errors=True)
|
| 409 |
-
return text
|
| 410 |
-
except Exception:
|
| 411 |
-
pass
|
| 412 |
-
raise gr.Error(
|
| 413 |
-
"Cannot read .doc files directly. Please save as .docx or .pdf and re-upload."
|
| 414 |
-
)
|
| 415 |
-
return extract_text_from_docx(filepath)
|
| 416 |
-
else:
|
| 417 |
-
# Plain text files (.txt, .md, etc.)
|
| 418 |
-
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
|
| 419 |
-
return f.read()
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
# ──────────────────────────────────────────────
|
| 423 |
-
# Main pipeline
|
| 424 |
-
# ──────────────────────────────────────────────
|
| 425 |
-
def generate_audiobook(
|
| 426 |
-
text_input: str,
|
| 427 |
-
file_input,
|
| 428 |
-
target_language: str,
|
| 429 |
-
voice_label: str,
|
| 430 |
-
add_pauses: bool,
|
| 431 |
-
progress=gr.Progress(),
|
| 432 |
-
):
|
| 433 |
-
"""Main audiobook generation pipeline."""
|
| 434 |
-
|
| 435 |
-
# ── Resolve text source ──
|
| 436 |
if file_input is not None:
|
| 437 |
try:
|
| 438 |
progress(0.02, desc="Extracting text from document...")
|
|
@@ -447,32 +480,45 @@ def generate_audiobook(
|
|
| 447 |
raise gr.Error("Please provide text or upload a file.")
|
| 448 |
|
| 449 |
if len(text) < 10:
|
| 450 |
-
raise gr.Error("Text is too short.
|
| 451 |
|
| 452 |
-
# ── API key ──
|
| 453 |
api_key = os.environ.get("DASHSCOPE_API_KEY", "")
|
| 454 |
if not api_key:
|
| 455 |
-
raise gr.Error(
|
| 456 |
-
"DASHSCOPE_API_KEY not set. Add it as a Space Secret "
|
| 457 |
-
"(Settings → Secrets → New Secret)."
|
| 458 |
-
)
|
| 459 |
|
| 460 |
-
voice = get_voice_name(voice_label)
|
| 461 |
lang_config = LANGUAGES[target_language]
|
|
|
|
| 462 |
translate = target_language != "English"
|
| 463 |
client = OpenAI(api_key=api_key, base_url=BASE_URL)
|
| 464 |
tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
|
| 465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
try:
|
| 467 |
-
#
|
| 468 |
-
progress(0.
|
| 469 |
chunks = split_text_into_chunks(text)
|
| 470 |
total_chunks = len(chunks)
|
| 471 |
total_chars = sum(len(c) for c in chunks)
|
| 472 |
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
# ── Generate speech for each chunk ──
|
| 476 |
audio_files = []
|
| 477 |
all_transcripts = []
|
| 478 |
silence_path = os.path.join(tmp_dir, "silence.wav")
|
|
@@ -480,39 +526,64 @@ def generate_audiobook(
|
|
| 480 |
generate_silence(1.5, silence_path)
|
| 481 |
|
| 482 |
for i, chunk in enumerate(chunks):
|
| 483 |
-
frac = 0.
|
| 484 |
progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
|
| 485 |
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
|
|
|
|
|
|
| 496 |
else:
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
|
| 504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
|
| 506 |
if not audio_files:
|
| 507 |
-
raise gr.Error("No audio was generated.
|
| 508 |
|
| 509 |
-
#
|
| 510 |
-
progress(0.
|
| 511 |
final_audio = os.path.join(tmp_dir, "audiobook.wav")
|
| 512 |
concatenate_wavs(audio_files, final_audio)
|
| 513 |
|
| 514 |
-
#
|
| 515 |
-
progress(0.
|
| 516 |
final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
|
| 517 |
subprocess.run(
|
| 518 |
["ffmpeg", "-y", "-i", final_audio,
|
|
@@ -523,21 +594,20 @@ def generate_audiobook(
|
|
| 523 |
|
| 524 |
progress(1.0, desc="Done!")
|
| 525 |
|
| 526 |
-
# Build transcript display
|
| 527 |
-
transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
|
| 528 |
-
|
| 529 |
-
# Stats
|
| 530 |
audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
|
|
|
|
| 531 |
stats = (
|
| 532 |
f"**Audiobook Generated!**\n\n"
|
| 533 |
f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
|
| 534 |
f"- **Language:** {target_language} ({lang_config['native']})\n"
|
| 535 |
-
f"- **Voice:** {
|
|
|
|
| 536 |
f"- **File size:** {audio_size:.1f} MB\n"
|
| 537 |
-
f"- **Quality tier:** {lang_config['tier'].title()}\n"
|
| 538 |
)
|
| 539 |
-
if lang_config["tier"] == "extended":
|
| 540 |
-
stats += "\n>
|
|
|
|
|
|
|
| 541 |
|
| 542 |
return final_mp3, stats, transcript_text
|
| 543 |
|
|
@@ -545,33 +615,11 @@ def generate_audiobook(
|
|
| 545 |
raise
|
| 546 |
except Exception as e:
|
| 547 |
raise gr.Error(f"Pipeline error: {str(e)}")
|
| 548 |
-
finally:
|
| 549 |
-
# Don't clean up tmp_dir yet — Gradio needs the files
|
| 550 |
-
pass
|
| 551 |
-
|
| 552 |
|
| 553 |
-
# ──────────────────────────────────────────────
|
| 554 |
-
# Build language choices with tier labels
|
| 555 |
-
# ──────────────────────────────────────────────
|
| 556 |
-
def get_language_choices():
|
| 557 |
-
core = [f"⭐ {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "core"]
|
| 558 |
-
extended = [f" {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "extended"]
|
| 559 |
-
return core + extended
|
| 560 |
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
return choice.replace("⭐ ", "").replace(" ", "").strip()
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
def generate_wrapper(text_input, file_input, language_choice, voice, add_pauses, progress=gr.Progress()):
|
| 568 |
-
language = clean_language_name(language_choice)
|
| 569 |
-
return generate_audiobook(text_input, file_input, language, voice, add_pauses, progress)
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
# ──────────────────────────────────────────────
|
| 573 |
-
# Sample text
|
| 574 |
-
# ──────────────────────────────────────────────
|
| 575 |
SAMPLE_TEXT = """Chapter 1: The Beginning
|
| 576 |
|
| 577 |
The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
|
|
@@ -580,41 +628,59 @@ The old lighthouse stood at the edge of the world, or so it seemed to the girl w
|
|
| 580 |
|
| 581 |
The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
|
| 582 |
|
| 583 |
-
Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather
|
| 584 |
|
| 585 |
The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
|
| 586 |
|
| 587 |
"Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
|
| 588 |
|
| 589 |
-
And he would smile
|
| 590 |
|
| 591 |
-
|
| 592 |
-
# ──────────────────────────────────────────────
|
| 593 |
-
# Gradio UI
|
| 594 |
-
# ──────────────────────────────────────────────
|
| 595 |
DESCRIPTION = """
|
| 596 |
-
#
|
| 597 |
-
### English Text
|
|
|
|
| 598 |
|
| 599 |
-
|
| 600 |
-
|
| 601 |
|
| 602 |
-
|
|
|
|
|
|
|
|
|
|
| 603 |
"""
|
| 604 |
|
| 605 |
-
# Language dropdown choices
|
| 606 |
lang_choices = []
|
| 607 |
-
lang_choices.append("── Core Languages (Best Quality) ──")
|
| 608 |
for name, cfg in LANGUAGES.items():
|
| 609 |
if cfg["tier"] == "core":
|
| 610 |
-
lang_choices.append(f"
|
| 611 |
-
lang_choices.append("── Extended Languages ──")
|
| 612 |
for name, cfg in LANGUAGES.items():
|
| 613 |
if cfg["tier"] == "extended":
|
| 614 |
lang_choices.append(name)
|
| 615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
with gr.Blocks(
|
| 617 |
-
title="Audiobook Generator
|
| 618 |
theme=gr.themes.Soft(
|
| 619 |
primary_hue="indigo",
|
| 620 |
secondary_hue="purple",
|
|
@@ -625,81 +691,98 @@ with gr.Blocks(
|
|
| 625 |
gr.Markdown(DESCRIPTION)
|
| 626 |
|
| 627 |
with gr.Row():
|
| 628 |
-
# ── Left column: Input ──
|
| 629 |
with gr.Column(scale=1):
|
| 630 |
text_input = gr.Textbox(
|
| 631 |
label="English Text",
|
| 632 |
placeholder="Paste your English text here...",
|
| 633 |
-
lines=
|
| 634 |
-
max_lines=
|
| 635 |
)
|
| 636 |
-
|
| 637 |
file_input = gr.File(
|
| 638 |
label="Or Upload a Document (.txt, .md, .pdf, .docx)",
|
| 639 |
file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
|
| 640 |
type="filepath",
|
| 641 |
)
|
|
|
|
| 642 |
|
| 643 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
info="⭐ = Core (best quality). Choose English for no translation.",
|
| 651 |
-
)
|
| 652 |
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
add_pauses = gr.Checkbox(
|
| 660 |
value=True,
|
| 661 |
label="Add pauses between sections",
|
| 662 |
-
info="
|
| 663 |
)
|
| 664 |
|
| 665 |
-
generate_btn = gr.Button(
|
| 666 |
-
"🎙️ Generate Audiobook",
|
| 667 |
-
variant="primary",
|
| 668 |
-
size="lg",
|
| 669 |
-
)
|
| 670 |
|
| 671 |
-
# ── Right column: Output ──
|
| 672 |
with gr.Column(scale=1):
|
| 673 |
-
audio_output = gr.Audio(
|
| 674 |
-
label="Generated Audiobook",
|
| 675 |
-
type="filepath",
|
| 676 |
-
)
|
| 677 |
-
|
| 678 |
stats_output = gr.Markdown(label="Generation Stats")
|
| 679 |
-
|
| 680 |
with gr.Accordion("Translation / Narration Transcript", open=False):
|
| 681 |
transcript_output = gr.Markdown()
|
| 682 |
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
|
|
|
|
|
|
| 687 |
)
|
| 688 |
|
| 689 |
generate_btn.click(
|
| 690 |
fn=generate_wrapper,
|
| 691 |
-
inputs=[text_input, file_input, target_lang,
|
|
|
|
| 692 |
outputs=[audio_output, stats_output, transcript_output],
|
| 693 |
)
|
| 694 |
|
| 695 |
-
# ── Footer ──
|
| 696 |
gr.Markdown(
|
| 697 |
"---\n"
|
| 698 |
-
"**
|
| 699 |
-
"
|
| 700 |
-
"
|
| 701 |
-
"
|
| 702 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
|
| 704 |
if __name__ == "__main__":
|
| 705 |
demo.launch()
|
|
|
|
| 1 |
"""
|
| 2 |
+
Audiobook Generator - English Source to Multi-Language Audio
|
| 3 |
+
Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC via DashScope API
|
| 4 |
+
|
| 5 |
+
Three voice modes:
|
| 6 |
+
1. Preset Voices: Use built-in Qwen voices (via Qwen3.5-Omni-Plus)
|
| 7 |
+
2. Cloned Voice: Clone a voice from audio sample (via Qwen3-TTS-VC)
|
| 8 |
+
3. Both support translation from English to 36 languages
|
| 9 |
+
|
| 10 |
+
Deploy as a Hugging Face Space:
|
| 11 |
+
1. Create a new Space (SDK: Gradio)
|
| 12 |
+
2. Upload app.py and requirements.txt
|
| 13 |
+
3. Add DASHSCOPE_API_KEY as a Space Secret
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
| 17 |
import base64
|
| 18 |
+
import json
|
| 19 |
import math
|
| 20 |
+
import pathlib
|
| 21 |
import shutil
|
| 22 |
import struct
|
| 23 |
import subprocess
|
|
|
|
| 26 |
import re
|
| 27 |
|
| 28 |
import gradio as gr
|
| 29 |
+
import requests as http_requests
|
| 30 |
from openai import OpenAI
|
| 31 |
|
| 32 |
+
# Optional document parsers
|
| 33 |
try:
|
| 34 |
import pypdf
|
| 35 |
HAS_PYPDF = True
|
|
|
|
| 42 |
except ImportError:
|
| 43 |
HAS_DOCX = False
|
| 44 |
|
|
|
|
| 45 |
# Configuration
|
| 46 |
+
OMNI_MODEL = "qwen3.5-omni-plus"
|
| 47 |
+
TTS_VC_MODEL = "qwen3-tts-vc-2026-01-22"
|
| 48 |
+
VOICE_CLONE_MODEL = "qwen-voice-enrollment"
|
| 49 |
+
|
| 50 |
BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
|
| 51 |
+
DASHSCOPE_API_URL = "https://dashscope-intl.aliyuncs.com/api/v1"
|
| 52 |
+
VOICE_CLONE_URL = f"{DASHSCOPE_API_URL}/services/audio/tts/customization"
|
| 53 |
+
TTS_SYNTHESIS_URL = f"{DASHSCOPE_API_URL}/services/aigc/multimodal-generation/generation"
|
| 54 |
|
|
|
|
|
|
|
| 55 |
MAX_CHARS_PER_CHUNK = 1500
|
| 56 |
|
| 57 |
+
# Languages
|
|
|
|
|
|
|
| 58 |
LANGUAGES = {
|
|
|
|
| 59 |
"English": {"code": "en", "native": "English", "tier": "core"},
|
| 60 |
+
"Chinese (Mandarin)": {"code": "zh", "native": "Chinese", "tier": "core"},
|
| 61 |
+
"Japanese": {"code": "ja", "native": "Japanese", "tier": "core"},
|
| 62 |
+
"Korean": {"code": "ko", "native": "Korean", "tier": "core"},
|
| 63 |
"German": {"code": "de", "native": "Deutsch", "tier": "core"},
|
| 64 |
+
"French": {"code": "fr", "native": "Francais", "tier": "core"},
|
| 65 |
+
"Russian": {"code": "ru", "native": "Russian", "tier": "core"},
|
| 66 |
+
"Portuguese": {"code": "pt", "native": "Portugues", "tier": "core"},
|
| 67 |
+
"Spanish": {"code": "es", "native": "Espanol", "tier": "core"},
|
| 68 |
"Italian": {"code": "it", "native": "Italiano", "tier": "core"},
|
| 69 |
+
"Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
|
|
|
|
| 70 |
"Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
|
| 71 |
"Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
|
| 72 |
+
"Turkish": {"code": "tr", "native": "Turkce", "tier": "extended"},
|
| 73 |
+
"Vietnamese": {"code": "vi", "native": "Tieng Viet", "tier": "extended"},
|
| 74 |
+
"Thai": {"code": "th", "native": "Thai", "tier": "extended"},
|
| 75 |
"Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
|
| 76 |
"Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
|
| 77 |
+
"Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
|
| 78 |
+
"Bengali": {"code": "bn", "native": "Bengali", "tier": "extended"},
|
| 79 |
+
"Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
|
| 80 |
"Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
|
| 81 |
+
"Czech": {"code": "cs", "native": "Cestina", "tier": "extended"},
|
| 82 |
+
"Romanian": {"code": "ro", "native": "Romana", "tier": "extended"},
|
| 83 |
+
"Greek": {"code": "el", "native": "Greek", "tier": "extended"},
|
| 84 |
"Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
|
| 85 |
"Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
|
| 86 |
"Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
|
| 87 |
"Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
|
| 88 |
+
"Ukrainian": {"code": "uk", "native": "Ukrainian", "tier": "extended"},
|
| 89 |
+
"Hebrew": {"code": "he", "native": "Hebrew", "tier": "extended"},
|
| 90 |
+
"Persian": {"code": "fa", "native": "Farsi", "tier": "extended"},
|
| 91 |
+
"Cantonese": {"code": "yue", "native": "Cantonese", "tier": "extended"},
|
| 92 |
"Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
|
| 93 |
"Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
|
| 94 |
+
"Tamil": {"code": "ta", "native": "Tamil", "tier": "extended"},
|
| 95 |
}
|
| 96 |
|
| 97 |
+
VOICE_CLONE_LANGUAGES = {
|
| 98 |
+
"English", "Chinese (Mandarin)", "Japanese", "Korean", "German",
|
| 99 |
+
"French", "Russian", "Portuguese", "Spanish", "Italian",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
+
PRESET_VOICES = [
|
| 103 |
+
"Cherry -- Sunny, friendly",
|
| 104 |
+
"Serena -- Gentle, soft",
|
| 105 |
+
"Jennifer -- Cinematic narrator",
|
| 106 |
+
"Katerina -- Mature, rich rhythm",
|
| 107 |
+
"Ethan -- Warm, energetic",
|
| 108 |
+
"Ryan -- Dramatic, rhythmic",
|
| 109 |
+
"Kai -- Soothing, calm",
|
| 110 |
+
"Neil -- Precise, clear",
|
| 111 |
+
"Lenn -- Rational, steady",
|
| 112 |
+
"Aiden -- Young, lively",
|
| 113 |
+
"Eldric Sage -- Authoritative narrator",
|
| 114 |
+
"Arthur -- Classic, mature",
|
| 115 |
+
"Mia -- Young, versatile",
|
| 116 |
+
"Bella -- Elegant, warm",
|
| 117 |
+
"Vivian -- Professional, clear",
|
| 118 |
+
"Seren -- Calm, measured",
|
| 119 |
+
"Dolce -- Sweet, melodic",
|
| 120 |
+
"Bellona -- Strong, commanding",
|
| 121 |
+
"Vincent -- Rich, theatrical",
|
| 122 |
+
"Andre -- Deep, resonant",
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def get_voice_name(label):
    """Extract the bare voice name from a 'Name -- Description' dropdown label."""
    name, _sep, _description = label.partition("--")
    return name.strip()
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
|
|
|
|
|
|
|
| 130 |
# Audio helpers
|
| 131 |
+
def base64_to_wav(b64_data, output_path):
    """Decode base64-encoded raw PCM and write it out as a playable WAV file.

    The payload is assumed to be 16-bit signed little-endian mono PCM at
    24 kHz — the same format used by generate_silence() and the ffmpeg
    concat step elsewhere in this file.

    Args:
        b64_data: Base64 string of raw PCM bytes (no header).
        output_path: Destination .wav path; overwritten if it exists.
    """
    # The stdlib `wave` module emits exactly the canonical 44-byte
    # RIFF/WAVE/fmt header the previous hand-rolled struct.pack code
    # produced, with less room for field-order mistakes.
    import wave  # local import: only this helper needs it

    audio_bytes = base64.b64decode(b64_data)
    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(1)      # mono
        wf.setsampwidth(2)      # 16-bit samples
        wf.setframerate(24000)  # 24 kHz, matching generate_silence()
        wf.writeframes(audio_bytes)
|
| 154 |
|
| 155 |
|
| 156 |
+
def concatenate_wavs(wav_files, output_path):
|
|
|
|
| 157 |
if not wav_files:
|
| 158 |
return
|
| 159 |
if len(wav_files) == 1:
|
|
|
|
| 171 |
os.remove(list_file)
|
| 172 |
|
| 173 |
|
| 174 |
+
def generate_silence(duration_sec, output_path):
    """Write `duration_sec` seconds of silence as a 16-bit mono 24 kHz WAV.

    Previously this spawned ffmpeg (`anullsrc`) for every call; writing
    zeroed frames with the stdlib `wave` module produces an equivalent
    file without the subprocess overhead and without requiring ffmpeg
    for this helper.

    Args:
        duration_sec: Length of the silence in seconds (may be fractional).
        output_path: Destination .wav path; overwritten if it exists.
    """
    import wave  # local import: only this helper needs it

    n_frames = int(round(24000 * duration_sec))
    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(1)      # mono
        wf.setsampwidth(2)      # 16-bit samples
        wf.setframerate(24000)  # matches base64_to_wav() output rate
        wf.writeframes(b"\x00\x00" * n_frames)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# Document extraction
|
| 184 |
+
def extract_text_from_pdf(filepath):
    """Extract plain text from a PDF, joining non-empty pages with blank lines."""
    if not HAS_PYPDF:
        raise ImportError("pypdf is not installed.")
    reader = pypdf.PdfReader(filepath)
    page_texts = [
        raw.strip()
        for raw in (page.extract_text() for page in reader.pages)
        if raw
    ]
    return "\n\n".join(page_texts)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def extract_text_from_docx(filepath):
    """Extract non-empty paragraphs from a .docx file, separated by blank lines."""
    if not HAS_DOCX:
        raise ImportError("python-docx is not installed.")
    document = docx.Document(filepath)
    kept = [para.text.strip() for para in document.paragraphs if para.text.strip()]
    return "\n\n".join(kept)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def extract_text_from_file(filepath):
    """Dispatch text extraction based on file extension.

    .pdf  -> pypdf, .docx -> python-docx, .doc -> best-effort LibreOffice
    conversion to .docx, anything else -> read as UTF-8 text (bad bytes
    replaced rather than raising).

    Raises:
        gr.Error: when a .doc file cannot be converted/read.
    """
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(filepath)
    elif ext in (".docx", ".doc"):
        if ext == ".doc":
            tmp_dir = None
            try:
                tmp_dir = tempfile.mkdtemp()
                subprocess.run(
                    ["libreoffice", "--headless", "--convert-to", "docx",
                     "--outdir", tmp_dir, filepath],
                    capture_output=True, check=True, timeout=60,
                )
                docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
                docx_path = os.path.join(tmp_dir, docx_name)
                if os.path.exists(docx_path):
                    return extract_text_from_docx(docx_path)
            except Exception:
                # Best-effort conversion: fall through to the user-facing error.
                pass
            finally:
                # Fix: the original leaked tmp_dir whenever conversion failed
                # or produced no file; clean it up on every path.
                if tmp_dir:
                    shutil.rmtree(tmp_dir, ignore_errors=True)
            raise gr.Error("Cannot read .doc files. Please save as .docx or .pdf.")
        return extract_text_from_docx(filepath)
    else:
        # Treat every other extension (.txt, .md, ...) as plain text.
        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
|
| 234 |
+
|
| 235 |
+
|
| 236 |
# Text splitting
|
| 237 |
+
def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
text = text.strip()
|
| 239 |
if not text:
|
| 240 |
return []
|
|
|
|
|
|
|
| 241 |
if len(text) <= max_chars:
|
| 242 |
return [text]
|
| 243 |
|
| 244 |
chunks = []
|
|
|
|
| 245 |
paragraphs = re.split(r"\n\s*\n", text)
|
|
|
|
| 246 |
current_chunk = ""
|
| 247 |
+
|
| 248 |
for para in paragraphs:
|
| 249 |
para = para.strip()
|
| 250 |
if not para:
|
| 251 |
continue
|
|
|
|
|
|
|
| 252 |
if len(current_chunk) + len(para) + 2 <= max_chars:
|
| 253 |
current_chunk = (current_chunk + "\n\n" + para).strip()
|
| 254 |
else:
|
|
|
|
| 255 |
if current_chunk:
|
| 256 |
chunks.append(current_chunk)
|
| 257 |
current_chunk = ""
|
|
|
|
|
|
|
| 258 |
if len(para) > max_chars:
|
| 259 |
sentences = re.split(r"(?<=[.!?])\s+", para)
|
| 260 |
for sentence in sentences:
|
|
|
|
| 263 |
else:
|
| 264 |
if current_chunk:
|
| 265 |
chunks.append(current_chunk)
|
|
|
|
| 266 |
if len(sentence) > max_chars:
|
| 267 |
words = sentence.split()
|
| 268 |
current_chunk = ""
|
|
|
|
| 280 |
|
| 281 |
if current_chunk:
|
| 282 |
chunks.append(current_chunk)
|
|
|
|
| 283 |
return chunks
|
| 284 |
|
| 285 |
|
| 286 |
+
# ==============================
|
| 287 |
+
# VOICE CLONING
|
| 288 |
+
# ==============================
|
| 289 |
+
def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
    """Enroll a custom voice from an audio sample via the voice-clone API.

    Encodes the sample as a data URI, posts an enrollment request, and
    returns the voice ID string to pass to the voice-clone TTS model.

    Raises:
        FileNotFoundError: if the sample file does not exist.
        RuntimeError: on a non-200 response or an unparseable reply.
    """
    sample = pathlib.Path(audio_path)
    if not sample.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    known_mimes = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".m4a": "audio/mp4"}
    mime = known_mimes.get(sample.suffix.lower(), "audio/mpeg")
    encoded = base64.b64encode(sample.read_bytes()).decode()
    data_uri = "data:{};base64,{}".format(mime, encoded)

    resp = http_requests.post(
        VOICE_CLONE_URL,
        json={
            "model": VOICE_CLONE_MODEL,
            "input": {
                "action": "create",
                "target_model": TTS_VC_MODEL,
                "preferred_name": preferred_name,
                "audio": {"data": data_uri},
            },
        },
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        timeout=60,
    )
    if resp.status_code != 200:
        raise RuntimeError(f"Voice cloning failed ({resp.status_code}): {resp.text}")

    try:
        return resp.json()["output"]["voice"]
    except (KeyError, ValueError) as e:
        raise RuntimeError(f"Failed to parse voice clone response: {e}\n{resp.text}")
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
# ==============================
|
| 326 |
+
# TTS WITH CLONED VOICE
|
| 327 |
+
# ==============================
|
| 328 |
+
def synthesize_with_cloned_voice(text, voice_id, language, api_key, output_dir, chunk_index):
    """Synthesize one text chunk with a cloned voice.

    Returns (wav_path, None) on success or (None, error_message) on
    failure; never raises, so the caller can substitute silence for
    failed chunks.
    """
    # Map UI language labels to the API's language_type values
    # (core languages only; anything unknown falls back to English).
    lang_type_map = {
        "English": "English", "Chinese (Mandarin)": "Chinese",
        "Japanese": "Japanese", "Korean": "Korean",
        "German": "German", "French": "French",
        "Russian": "Russian", "Portuguese": "Portuguese",
        "Spanish": "Spanish", "Italian": "Italian",
    }
    request_body = {
        "model": TTS_VC_MODEL,
        "input": {
            "text": text,
            "voice": voice_id,
            "language_type": lang_type_map.get(language, "English"),
        },
    }
    auth_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    try:
        resp = http_requests.post(
            TTS_SYNTHESIS_URL, json=request_body, headers=auth_headers, timeout=120,
        )
        if resp.status_code != 200:
            return None, f"TTS failed ({resp.status_code}): {resp.text[:200]}"

        result = resp.json()
        audio_url = result.get("output", {}).get("audio", {}).get("url")
        if not audio_url:
            return None, f"No audio URL in response: {json.dumps(result)[:200]}"

        # The API returns a URL to the rendered audio; download it locally.
        audio_resp = http_requests.get(audio_url, timeout=120)
        if audio_resp.status_code != 200:
            return None, "Failed to download audio from URL"

        output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
        with open(output_wav, "wb") as f:
            f.write(audio_resp.content)
        return output_wav, None

    except Exception as e:
        # Deliberate catch-all: errors are reported, not raised.
        return None, str(e)
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
# ==============================
|
| 376 |
+
# TRANSLATION (text only)
|
| 377 |
+
# ==============================
|
| 378 |
+
def translate_text(client, text, target_language, lang_config):
    """Translate English `text` into `target_language` using the Omni model (text-only)."""
    system_msg = (
        f"You are a professional translator. Translate English text into "
        f"natural, fluent {target_language} ({lang_config['native']}). "
        f"Output ONLY the translated text."
    )
    conversation = [
        {"role": "system", "content": system_msg},
        {
            "role": "user",
            "content": f"Translate the following into {target_language}:\n\n{text}",
        },
    ]
    reply = client.chat.completions.create(
        model=OMNI_MODEL,
        modalities=["text"],  # text-only: no audio output for translation
        messages=conversation,
    )
    return reply.choices[0].message.content.strip()
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
# ==============================
|
| 401 |
+
# SPEECH WITH PRESET VOICE
|
| 402 |
+
# ==============================
|
| 403 |
+
def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
|
| 404 |
output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
|
| 405 |
|
| 406 |
if translate and language != "English":
|
| 407 |
system_prompt = (
|
| 408 |
f"You are a professional audiobook narrator and translator.\n"
|
| 409 |
+
f"Translate the English text into natural {language} ({lang_config['native']}).\n"
|
| 410 |
+
f"Read the translation aloud with expressive audiobook narration.\n"
|
| 411 |
+
f"Respond ONLY with the spoken {language} narration."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
)
|
| 413 |
+
user_text = f"Translate into {language} and narrate as an audiobook:\n\n{text}"
|
| 414 |
else:
|
| 415 |
system_prompt = (
|
| 416 |
"You are a professional audiobook narrator.\n"
|
| 417 |
+
"Read the text with clear, expressive narration.\n"
|
| 418 |
+
"Respond ONLY with the spoken narration."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
)
|
| 420 |
+
user_text = f"Narrate as an audiobook:\n\n{text}"
|
| 421 |
|
| 422 |
try:
|
| 423 |
completion = client.chat.completions.create(
|
| 424 |
+
model=OMNI_MODEL,
|
| 425 |
messages=[
|
| 426 |
{"role": "system", "content": system_prompt},
|
| 427 |
{"role": "user", "content": user_text},
|
|
|
|
| 454 |
full_audio_b64 = "".join(audio_chunks)
|
| 455 |
base64_to_wav(full_audio_b64, output_wav)
|
| 456 |
return output_wav, transcript
|
| 457 |
+
return None, "No audio received"
|
|
|
|
| 458 |
|
| 459 |
except Exception as e:
|
| 460 |
return None, str(e)
|
| 461 |
|
| 462 |
|
| 463 |
+
# ==============================
|
| 464 |
+
# MAIN PIPELINE
|
| 465 |
+
# ==============================
|
| 466 |
+
def generate_audiobook(text_input, file_input, target_language, voice_mode,
|
| 467 |
+
preset_voice_label, clone_audio, add_pauses, progress=gr.Progress()):
|
| 468 |
+
# Resolve text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
if file_input is not None:
|
| 470 |
try:
|
| 471 |
progress(0.02, desc="Extracting text from document...")
|
|
|
|
| 480 |
raise gr.Error("Please provide text or upload a file.")
|
| 481 |
|
| 482 |
if len(text) < 10:
|
| 483 |
+
raise gr.Error("Text is too short.")
|
| 484 |
|
|
|
|
| 485 |
api_key = os.environ.get("DASHSCOPE_API_KEY", "")
|
| 486 |
if not api_key:
|
| 487 |
+
raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
|
|
|
|
|
|
|
|
|
|
| 488 |
|
|
|
|
| 489 |
lang_config = LANGUAGES[target_language]
|
| 490 |
+
use_clone = voice_mode == "Clone a Voice"
|
| 491 |
translate = target_language != "English"
|
| 492 |
client = OpenAI(api_key=api_key, base_url=BASE_URL)
|
| 493 |
tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
|
| 494 |
|
| 495 |
+
# Voice cloning setup
|
| 496 |
+
cloned_voice_id = None
|
| 497 |
+
if use_clone:
|
| 498 |
+
if clone_audio is None:
|
| 499 |
+
raise gr.Error("Please upload a voice sample (10-60 seconds of clear speech).")
|
| 500 |
+
|
| 501 |
+
if target_language not in VOICE_CLONE_LANGUAGES:
|
| 502 |
+
raise gr.Error(
|
| 503 |
+
f"Voice cloning TTS supports: {', '.join(sorted(VOICE_CLONE_LANGUAGES))}. "
|
| 504 |
+
f"'{target_language}' is not supported with cloned voices. Use a preset voice instead."
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
+
progress(0.03, desc="Cloning voice from audio sample...")
|
| 508 |
+
try:
|
| 509 |
+
cloned_voice_id = clone_voice(clone_audio, api_key)
|
| 510 |
+
progress(0.08, desc="Voice cloned successfully!")
|
| 511 |
+
except Exception as e:
|
| 512 |
+
raise gr.Error(f"Voice cloning failed: {e}")
|
| 513 |
+
|
| 514 |
try:
|
| 515 |
+
# Split text
|
| 516 |
+
progress(0.10, desc="Splitting text into chunks...")
|
| 517 |
chunks = split_text_into_chunks(text)
|
| 518 |
total_chunks = len(chunks)
|
| 519 |
total_chars = sum(len(c) for c in chunks)
|
| 520 |
|
| 521 |
+
# Process each chunk
|
|
|
|
|
|
|
| 522 |
audio_files = []
|
| 523 |
all_transcripts = []
|
| 524 |
silence_path = os.path.join(tmp_dir, "silence.wav")
|
|
|
|
| 526 |
generate_silence(1.5, silence_path)
|
| 527 |
|
| 528 |
for i, chunk in enumerate(chunks):
|
| 529 |
+
frac = 0.12 + 0.75 * (i / total_chunks)
|
| 530 |
progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
|
| 531 |
|
| 532 |
+
if use_clone:
|
| 533 |
+
# CLONED VOICE PIPELINE
|
| 534 |
+
final_text = chunk
|
| 535 |
+
if translate:
|
| 536 |
+
try:
|
| 537 |
+
final_text = translate_text(client, chunk, target_language, lang_config)
|
| 538 |
+
all_transcripts.append(final_text)
|
| 539 |
+
except Exception as e:
|
| 540 |
+
all_transcripts.append(f"Translation failed for chunk {i+1}: {e}")
|
| 541 |
+
final_text = chunk
|
| 542 |
+
|
| 543 |
+
wav_path, error = synthesize_with_cloned_voice(
|
| 544 |
+
final_text, cloned_voice_id, target_language, api_key, tmp_dir, i,
|
| 545 |
+
)
|
| 546 |
|
| 547 |
+
if wav_path:
|
| 548 |
+
audio_files.append(wav_path)
|
| 549 |
+
else:
|
| 550 |
+
all_transcripts.append(f"TTS failed for chunk {i+1}: {error}")
|
| 551 |
+
fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
|
| 552 |
+
generate_silence(2.0, fail_silence)
|
| 553 |
+
audio_files.append(fail_silence)
|
| 554 |
else:
|
| 555 |
+
# PRESET VOICE PIPELINE
|
| 556 |
+
voice = get_voice_name(preset_voice_label)
|
| 557 |
+
wav_path, transcript = generate_speech_preset(
|
| 558 |
+
client, chunk, voice, target_language,
|
| 559 |
+
lang_config, translate, i, tmp_dir,
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
if wav_path:
|
| 563 |
+
audio_files.append(wav_path)
|
| 564 |
+
else:
|
| 565 |
+
all_transcripts.append(f"Chunk {i+1} failed: {transcript}")
|
| 566 |
+
fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
|
| 567 |
+
generate_silence(2.0, fail_silence)
|
| 568 |
+
audio_files.append(fail_silence)
|
| 569 |
|
| 570 |
+
if transcript and "failed" not in transcript.lower():
|
| 571 |
+
all_transcripts.append(transcript)
|
| 572 |
+
|
| 573 |
+
# Pause between chunks
|
| 574 |
+
if add_pauses and i < total_chunks - 1 and audio_files:
|
| 575 |
+
audio_files.append(silence_path)
|
| 576 |
|
| 577 |
if not audio_files:
|
| 578 |
+
raise gr.Error("No audio was generated.")
|
| 579 |
|
| 580 |
+
# Concatenate
|
| 581 |
+
progress(0.90, desc="Assembling audiobook...")
|
| 582 |
final_audio = os.path.join(tmp_dir, "audiobook.wav")
|
| 583 |
concatenate_wavs(audio_files, final_audio)
|
| 584 |
|
| 585 |
+
# Convert to MP3
|
| 586 |
+
progress(0.95, desc="Converting to MP3...")
|
| 587 |
final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
|
| 588 |
subprocess.run(
|
| 589 |
["ffmpeg", "-y", "-i", final_audio,
|
|
|
|
| 594 |
|
| 595 |
progress(1.0, desc="Done!")
|
| 596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
|
| 598 |
+
voice_info = f"Cloned voice (ID: {cloned_voice_id[:20]}...)" if use_clone else preset_voice_label
|
| 599 |
stats = (
|
| 600 |
f"**Audiobook Generated!**\n\n"
|
| 601 |
f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
|
| 602 |
f"- **Language:** {target_language} ({lang_config['native']})\n"
|
| 603 |
+
f"- **Voice:** {voice_info}\n"
|
| 604 |
+
f"- **Mode:** {'Voice Clone via Qwen3-TTS-VC' if use_clone else 'Preset via Qwen3.5-Omni-Plus'}\n"
|
| 605 |
f"- **File size:** {audio_size:.1f} MB\n"
|
|
|
|
| 606 |
)
|
| 607 |
+
if lang_config["tier"] == "extended" and not use_clone:
|
| 608 |
+
stats += "\n> Note: Extended language - voice quality may vary."
|
| 609 |
+
|
| 610 |
+
transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
|
| 611 |
|
| 612 |
return final_mp3, stats, transcript_text
|
| 613 |
|
|
|
|
| 615 |
raise
|
| 616 |
except Exception as e:
|
| 617 |
raise gr.Error(f"Pipeline error: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
|
| 620 |
+
# ==============================
|
| 621 |
+
# GRADIO UI
|
| 622 |
+
# ==============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
SAMPLE_TEXT = """Chapter 1: The Beginning
|
| 624 |
|
| 625 |
The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
|
|
|
|
| 628 |
|
| 629 |
The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
|
| 630 |
|
| 631 |
+
Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather - grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.
|
| 632 |
|
| 633 |
The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
|
| 634 |
|
| 635 |
"Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
|
| 636 |
|
| 637 |
+
And he would smile - that slow, careful smile that seemed to cost him something each time - and begin."""
|
| 638 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
DESCRIPTION = """
|
| 640 |
+
# Audiobook Generator
|
| 641 |
+
### English Text to Multi-Language Audiobook with Voice Cloning
|
| 642 |
+
**Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC**
|
| 643 |
|
| 644 |
+
Upload English text and generate a narrated audiobook in **36 languages**.
|
| 645 |
+
Choose a **preset voice** or **clone any voice** from a short audio sample!
|
| 646 |
|
| 647 |
+
| Mode | Model | Languages | How it works |
|
| 648 |
+
|------|-------|-----------|-------------|
|
| 649 |
+
| **Preset Voice** | Qwen3.5-Omni-Plus | 36 languages | Translates + speaks in one step |
|
| 650 |
+
| **Clone a Voice** | Qwen3-TTS-VC | 10 core languages | Clones voice, translates, then speaks |
|
| 651 |
"""
|
| 652 |
|
|
|
|
| 653 |
lang_choices = []
|
|
|
|
| 654 |
for name, cfg in LANGUAGES.items():
|
| 655 |
if cfg["tier"] == "core":
|
| 656 |
+
lang_choices.append(f"* {name}")
|
|
|
|
| 657 |
for name, cfg in LANGUAGES.items():
|
| 658 |
if cfg["tier"] == "extended":
|
| 659 |
lang_choices.append(name)
|
| 660 |
|
| 661 |
+
|
| 662 |
+
def clean_language_name(choice):
    """Strip the "* " core-language marker from a dropdown choice label."""
    without_marker = choice.replace("* ", "")
    return without_marker.strip()
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
def on_voice_mode_change(mode):
    """Toggle widget visibility: preset dropdown vs clone-audio + tips.

    Returns updates for (preset_voice, clone_audio, clone_info).
    """
    cloning = mode == "Clone a Voice"
    return (
        gr.update(visible=not cloning),
        gr.update(visible=cloning),
        gr.update(visible=cloning),
    )
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
def generate_wrapper(text_input, file_input, language_choice, voice_mode,
                     preset_voice, clone_audio, add_pauses, progress=gr.Progress()):
    """UI adapter: normalize the dropdown language label, then run the pipeline."""
    resolved_language = clean_language_name(language_choice)
    return generate_audiobook(
        text_input,
        file_input,
        resolved_language,
        voice_mode,
        preset_voice,
        clone_audio,
        add_pauses,
        progress,
    )
|
| 680 |
+
|
| 681 |
+
|
| 682 |
with gr.Blocks(
|
| 683 |
+
title="Audiobook Generator",
|
| 684 |
theme=gr.themes.Soft(
|
| 685 |
primary_hue="indigo",
|
| 686 |
secondary_hue="purple",
|
|
|
|
| 691 |
gr.Markdown(DESCRIPTION)
|
| 692 |
|
| 693 |
with gr.Row():
|
|
|
|
| 694 |
with gr.Column(scale=1):
|
| 695 |
text_input = gr.Textbox(
|
| 696 |
label="English Text",
|
| 697 |
placeholder="Paste your English text here...",
|
| 698 |
+
lines=10,
|
| 699 |
+
max_lines=25,
|
| 700 |
)
|
|
|
|
| 701 |
file_input = gr.File(
|
| 702 |
label="Or Upload a Document (.txt, .md, .pdf, .docx)",
|
| 703 |
file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
|
| 704 |
type="filepath",
|
| 705 |
)
|
| 706 |
+
sample_btn = gr.Button("Load Sample Text", variant="secondary", size="sm")
|
| 707 |
|
| 708 |
+
target_lang = gr.Dropdown(
|
| 709 |
+
choices=lang_choices,
|
| 710 |
+
value="* English",
|
| 711 |
+
label="Target Language",
|
| 712 |
+
info="* = Core (best quality). Voice cloning supports core languages only.",
|
| 713 |
+
)
|
| 714 |
|
| 715 |
+
voice_mode = gr.Radio(
|
| 716 |
+
choices=["Preset Voice", "Clone a Voice"],
|
| 717 |
+
value="Preset Voice",
|
| 718 |
+
label="Voice Mode",
|
| 719 |
+
)
|
|
|
|
|
|
|
| 720 |
|
| 721 |
+
preset_voice = gr.Dropdown(
|
| 722 |
+
choices=PRESET_VOICES,
|
| 723 |
+
value="Jennifer -- Cinematic narrator",
|
| 724 |
+
label="Preset Narrator Voice",
|
| 725 |
+
visible=True,
|
| 726 |
+
)
|
| 727 |
+
|
| 728 |
+
clone_audio = gr.Audio(
|
| 729 |
+
label="Upload Voice Sample (10-60s of clear speech, WAV/MP3/M4A)",
|
| 730 |
+
type="filepath",
|
| 731 |
+
visible=False,
|
| 732 |
+
)
|
| 733 |
+
|
| 734 |
+
clone_info = gr.Markdown(
|
| 735 |
+
value=(
|
| 736 |
+
"> **Voice cloning tips:**\n"
|
| 737 |
+
"> - Use 10-60 seconds of clear, single-speaker audio\n"
|
| 738 |
+
"> - No background music or noise\n"
|
| 739 |
+
"> - WAV (16-bit), MP3, or M4A format\n"
|
| 740 |
+
"> - Sample rate at least 24 kHz recommended\n"
|
| 741 |
+
"> - Cloned voice TTS supports 10 core languages only"
|
| 742 |
+
),
|
| 743 |
+
visible=False,
|
| 744 |
+
)
|
| 745 |
|
| 746 |
add_pauses = gr.Checkbox(
|
| 747 |
value=True,
|
| 748 |
label="Add pauses between sections",
|
| 749 |
+
info="1.5s silence between chunks",
|
| 750 |
)
|
| 751 |
|
| 752 |
+
generate_btn = gr.Button("Generate Audiobook", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
|
|
|
|
| 754 |
with gr.Column(scale=1):
|
| 755 |
+
audio_output = gr.Audio(label="Generated Audiobook", type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
stats_output = gr.Markdown(label="Generation Stats")
|
|
|
|
| 757 |
with gr.Accordion("Translation / Narration Transcript", open=False):
|
| 758 |
transcript_output = gr.Markdown()
|
| 759 |
|
| 760 |
+
sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)
|
| 761 |
+
|
| 762 |
+
voice_mode.change(
|
| 763 |
+
fn=on_voice_mode_change,
|
| 764 |
+
inputs=voice_mode,
|
| 765 |
+
outputs=[preset_voice, clone_audio, clone_info],
|
| 766 |
)
|
| 767 |
|
| 768 |
generate_btn.click(
|
| 769 |
fn=generate_wrapper,
|
| 770 |
+
inputs=[text_input, file_input, target_lang, voice_mode,
|
| 771 |
+
preset_voice, clone_audio, add_pauses],
|
| 772 |
outputs=[audio_output, stats_output, transcript_output],
|
| 773 |
)
|
| 774 |
|
|
|
|
| 775 |
gr.Markdown(
|
| 776 |
"---\n"
|
| 777 |
+
"**How it works:**\n\n"
|
| 778 |
+
"**Preset voice mode:** Text goes to Qwen3.5-Omni-Plus (translates + speaks in one call) then outputs MP3\n\n"
|
| 779 |
+
"**Clone voice mode:** Voice sample goes to Qwen Voice Enrollment (creates voice ID), "
|
| 780 |
+
"text goes to Qwen3.5-Omni-Plus (translates to target language), "
|
| 781 |
+
"then Qwen3-TTS-VC (synthesizes speech with cloned voice) outputs MP3\n\n"
|
| 782 |
+
"**Voice cloning supports:** Chinese, English, Japanese, Korean, German, French, "
|
| 783 |
+
"Russian, Portuguese, Spanish, Italian\n\n"
|
| 784 |
+
"Built with Gradio | Model by Alibaba Qwen | API via DashScope"
|
| 785 |
+
)
|
| 786 |
|
| 787 |
if __name__ == "__main__":
|
| 788 |
demo.launch()
|
requirements.txt
CHANGED
|
@@ -3,3 +3,4 @@ gradio>=5.25.0
|
|
| 3 |
audioop-lts; python_version >= "3.13"
|
| 4 |
pypdf>=4.0.0
|
| 5 |
python-docx>=1.1.0
|
|
|
|
|
|
| 3 |
audioop-lts; python_version >= "3.13"
|
| 4 |
pypdf>=4.0.0
|
| 5 |
python-docx>=1.1.0
|
| 6 |
+
requests>=2.31.0
|