PlotweaverModel committed on
Commit 33bd369 · verified · 1 Parent(s): 76d6cf7

files update

Files changed (2)
  1. app.py +461 -378
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,15 +1,23 @@
1
  """
2
- 📖 Audiobook Generator — English Source to Multi-Language Audio
3
-
4
- Two modes:
5
- 1. Translation + TTS: Translate English text to target language, then generate speech
6
- 2. Direct TTS: Generate speech from English text directly
7
-
8
  """
9
 
10
  import os
11
  import base64
 
12
  import math
 
13
  import shutil
14
  import struct
15
  import subprocess
@@ -18,9 +26,10 @@ import time
18
  import re
19
 
20
  import gradio as gr
 
21
  from openai import OpenAI
22
 
23
- # Optional document parsers — installed via requirements.txt
24
  try:
25
  import pypdf
26
  HAS_PYPDF = True
@@ -33,139 +42,118 @@ try:
33
  except ImportError:
34
  HAS_DOCX = False
35
 
36
- # ──────────────────────────────────────────────
37
  # Configuration
38
- # ──────────────────────────────────────────────
39
- MODEL = "qwen3.5-omni-plus"
 
 
40
  BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
 
 
 
41
 
42
- # Maximum characters per chunk sent to the API
43
- # The model has token limits, so we split long texts
44
  MAX_CHARS_PER_CHUNK = 1500
45
 
46
- # All 36 speech output languages supported by Qwen3.5-Omni
47
- # Core 10 languages have the best quality; extended languages are supported
48
- # but may vary in quality as they include dialects
49
  LANGUAGES = {
50
- # ── Core 10 Languages (highest quality) ──
51
  "English": {"code": "en", "native": "English", "tier": "core"},
52
- "Chinese (Mandarin)": {"code": "zh", "native": "中文", "tier": "core"},
53
- "Japanese": {"code": "ja", "native": "日本語", "tier": "core"},
54
- "Korean": {"code": "ko", "native": "한국어", "tier": "core"},
55
  "German": {"code": "de", "native": "Deutsch", "tier": "core"},
56
- "French": {"code": "fr", "native": "Français", "tier": "core"},
57
- "Russian": {"code": "ru", "native": "Русский", "tier": "core"},
58
- "Portuguese": {"code": "pt", "native": "Português", "tier": "core"},
59
- "Spanish": {"code": "es", "native": "Español", "tier": "core"},
60
  "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
61
- # ── Extended Languages (Qwen3.5-Omni expanded to 36) ──
62
- "Arabic": {"code": "ar", "native": "العربية", "tier": "extended"},
63
  "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
64
  "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
65
- "Turkish": {"code": "tr", "native": "Türkçe", "tier": "extended"},
66
- "Vietnamese": {"code": "vi", "native": "Tiếng Việt", "tier": "extended"},
67
- "Thai": {"code": "th", "native": "ภาษาไทย", "tier": "extended"},
68
  "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
69
  "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
70
- "Hindi": {"code": "hi", "native": "हिन्दी", "tier": "extended"},
71
- "Bengali": {"code": "bn", "native": "বাংলা", "tier": "extended"},
72
- "Urdu": {"code": "ur", "native": "اردو", "tier": "extended"},
73
  "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
74
- "Czech": {"code": "cs", "native": "Čeština", "tier": "extended"},
75
- "Romanian": {"code": "ro", "native": "Română", "tier": "extended"},
76
- "Greek": {"code": "el", "native": "Ελληνικά", "tier": "extended"},
77
  "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
78
  "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
79
  "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
80
  "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
81
- "Ukrainian": {"code": "uk", "native": "Українська", "tier": "extended"},
82
- "Hebrew": {"code": "he", "native": "עברית", "tier": "extended"},
83
- "Persian": {"code": "fa", "native": "فارسی", "tier": "extended"},
84
- "Cantonese": {"code": "yue", "native": "粵語", "tier": "extended"},
85
  "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
86
  "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
87
- "Tamil": {"code": "ta", "native": "தமிழ்", "tier": "extended"},
88
  }
89
 
90
- VOICES = {
91
- "Male Voices": [
92
- "Ethan — Warm, energetic",
93
- "Ryan — Dramatic, rhythmic",
94
- "Kai — Soothing, calm",
95
- "Neil — Precise, clear",
96
- "Lenn — Rational, steady",
97
- "Aiden — Young, lively",
98
- "Eldric Sage — Authoritative narrator",
99
- "Arthur — Classic, mature",
100
- "Elias — Soft, thoughtful",
101
- "Alek — Confident, modern",
102
- "Andre — Deep, resonant",
103
- "Emilien — Gentle, French-inspired",
104
- "Vincent — Rich, theatrical",
105
- ],
106
- "Female Voices": [
107
- "Cherry — Sunny, friendly",
108
- "Serena — Gentle, soft",
109
- "Jennifer — Cinematic narrator",
110
- "Katerina — Mature, rich rhythm",
111
- "Chelsie — Bright, expressive",
112
- "Mia — Young, versatile",
113
- "Bella — Elegant, warm",
114
- "Vivian — Professional, clear",
115
- "Moon — Dreamy, ethereal",
116
- "Maia — Confident, articulate",
117
- "Seren — Calm, measured",
118
- "Dolce — Sweet, melodic",
119
- "Bellona — Strong, commanding",
120
- "Bunny — Playful, light",
121
- "Momo — Cute, upbeat",
122
- "Mochi — Soft, adorable",
123
- ],
124
  }
125
 
126
- # Flatten voice list for the dropdown
127
- ALL_VOICES = []
128
- for category, voices in VOICES.items():
129
- for v in voices:
130
- ALL_VOICES.append(v)
131
-
132
 
133
- def get_voice_name(voice_label: str) -> str:
134
- """Extract just the voice name from 'Name — Description' format."""
135
- return voice_label.split("—")[0].strip()
136
 
137
-
138
- # ──────────────────────────────────────────────
139
  # Audio helpers
140
- # ──────────────────────────────────────────────
141
- def base64_to_wav(b64_data: str, output_path: str):
142
- """Decode base64 PCM data and write a proper WAV file."""
143
  audio_bytes = base64.b64decode(b64_data)
144
- sample_rate = 24000
145
- num_channels = 1
146
- bits_per_sample = 16
147
- byte_rate = sample_rate * num_channels * bits_per_sample // 8
148
- block_align = num_channels * bits_per_sample // 8
149
- data_size = len(audio_bytes)
150
  with open(output_path, "wb") as f:
151
  f.write(b"RIFF")
152
- f.write(struct.pack("<I", 36 + data_size))
153
  f.write(b"WAVE")
154
  f.write(b"fmt ")
155
  f.write(struct.pack("<I", 16))
156
  f.write(struct.pack("<H", 1))
157
- f.write(struct.pack("<H", num_channels))
158
- f.write(struct.pack("<I", sample_rate))
159
- f.write(struct.pack("<I", byte_rate))
160
- f.write(struct.pack("<H", block_align))
161
- f.write(struct.pack("<H", bits_per_sample))
162
  f.write(b"data")
163
- f.write(struct.pack("<I", data_size))
164
  f.write(audio_bytes)
165
 
166
 
167
- def concatenate_wavs(wav_files: list, output_path: str):
168
- """Concatenate multiple WAV files using ffmpeg."""
169
  if not wav_files:
170
  return
171
  if len(wav_files) == 1:
@@ -183,43 +171,90 @@ def concatenate_wavs(wav_files: list, output_path: str):
183
  os.remove(list_file)
184
 
185
 
186
- # ──────────────────────────────────────────────
187
  # Text splitting
188
- # ──────────────────────────────────────────────
189
- def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> list:
190
- """
191
- Split text into chunks at sentence boundaries.
192
- Tries to keep paragraphs together when possible.
193
- """
194
- # Normalize whitespace
195
  text = text.strip()
196
  if not text:
197
  return []
198
-
199
- # If short enough, return as-is
200
  if len(text) <= max_chars:
201
  return [text]
202
 
203
  chunks = []
204
- # First split by paragraphs
205
  paragraphs = re.split(r"\n\s*\n", text)
206
-
207
  current_chunk = ""
 
208
  for para in paragraphs:
209
  para = para.strip()
210
  if not para:
211
  continue
212
-
213
- # If adding this paragraph keeps us under the limit
214
  if len(current_chunk) + len(para) + 2 <= max_chars:
215
  current_chunk = (current_chunk + "\n\n" + para).strip()
216
  else:
217
- # Save current chunk if it has content
218
  if current_chunk:
219
  chunks.append(current_chunk)
220
  current_chunk = ""
221
-
222
- # If the paragraph itself is too long, split by sentences
223
  if len(para) > max_chars:
224
  sentences = re.split(r"(?<=[.!?])\s+", para)
225
  for sentence in sentences:
@@ -228,7 +263,6 @@ def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> l
228
  else:
229
  if current_chunk:
230
  chunks.append(current_chunk)
231
- # If a single sentence is too long, force-split it
232
  if len(sentence) > max_chars:
233
  words = sentence.split()
234
  current_chunk = ""
@@ -246,64 +280,148 @@ def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> l
246
 
247
  if current_chunk:
248
  chunks.append(current_chunk)
249
-
250
  return chunks
251
 
252
 
253
- # ──────────────────────────────────────────────
254
- # API: Generate speech for a text chunk
255
- # ──────────────────────────────────────────────
256
- def generate_speech_chunk(
257
- client: OpenAI,
258
- text: str,
259
- voice: str,
260
- language: str,
261
- lang_config: dict,
262
- translate: bool,
263
- chunk_index: int,
264
- output_dir: str,
265
- ) -> tuple:
266
- """
267
- Send a text chunk to Qwen3.5-Omni-Plus and get back audio.
268
- If translate=True, translates from English to target language and speaks.
269
- If translate=False, speaks the text directly in English.
270
- Returns (wav_path, transcript) or (None, error_msg).
271
- """
 
272
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
273
 
274
  if translate and language != "English":
275
  system_prompt = (
276
  f"You are a professional audiobook narrator and translator.\n"
277
- f"You will receive English text. Your task:\n"
278
- f"1. Translate the text into natural, fluent {language} ({lang_config['native']}).\n"
279
- f"2. Read the translated text aloud with clear, expressive narration.\n"
280
- f"3. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
281
- f" descriptions, and emotional moments.\n"
282
- f"4. Respond ONLY with the spoken {language} narration — no English,\n"
283
- f" no meta-commentary, no chapter headers unless they're in the text.\n"
284
- f"5. Maintain a natural reading pace suitable for an audiobook.\n"
285
- f"6. Translate idioms and cultural references appropriately."
286
- )
287
- user_text = (
288
- f"Translate the following English text into {language} and narrate it "
289
- f"as an audiobook. Respond only with the spoken {language} narration:\n\n{text}"
290
  )
 
291
  else:
292
  system_prompt = (
293
  "You are a professional audiobook narrator.\n"
294
- "You will receive text to read aloud. Your task:\n"
295
- "1. Read the text with clear, expressive narration.\n"
296
- "2. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
297
- " descriptions, and emotional moments.\n"
298
- "3. Respond ONLY with the spoken narration — no meta-commentary.\n"
299
- "4. Maintain a natural reading pace suitable for an audiobook.\n"
300
- "5. Pause appropriately between paragraphs and at punctuation."
301
  )
302
- user_text = f"Narrate the following text as an audiobook:\n\n{text}"
303
 
304
  try:
305
  completion = client.chat.completions.create(
306
- model=MODEL,
307
  messages=[
308
  {"role": "system", "content": system_prompt},
309
  {"role": "user", "content": user_text},
@@ -336,103 +454,18 @@ def generate_speech_chunk(
336
  full_audio_b64 = "".join(audio_chunks)
337
  base64_to_wav(full_audio_b64, output_wav)
338
  return output_wav, transcript
339
- else:
340
- return None, "No audio received from API"
341
 
342
  except Exception as e:
343
  return None, str(e)
344
 
345
 
346
- # ──────────────────────────────────────────────
347
- # Generate silence between chapters/sections
348
- # ──────────────────────────────────────────────
349
- def generate_silence(duration_sec: float, output_path: str):
350
- """Generate a silent WAV file."""
351
- subprocess.run(
352
- ["ffmpeg", "-y", "-f", "lavfi",
353
- "-i", f"anullsrc=r=24000:cl=mono",
354
- "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
355
- capture_output=True, check=True,
356
- )
357
-
358
-
359
- # ──────────────────────────────────────────────
360
- # Document text extraction
361
- # ──────────────────────────────────────────────
362
- def extract_text_from_pdf(filepath: str) -> str:
363
- """Extract text from a PDF file using pypdf."""
364
- if not HAS_PYPDF:
365
- raise ImportError("pypdf is not installed. Cannot read PDF files.")
366
- reader = pypdf.PdfReader(filepath)
367
- pages = []
368
- for page in reader.pages:
369
- text = page.extract_text()
370
- if text:
371
- pages.append(text.strip())
372
- return "\n\n".join(pages)
373
-
374
-
375
- def extract_text_from_docx(filepath: str) -> str:
376
- """Extract text from a .docx file using python-docx."""
377
- if not HAS_DOCX:
378
- raise ImportError("python-docx is not installed. Cannot read Word files.")
379
- doc = docx.Document(filepath)
380
- paragraphs = []
381
- for para in doc.paragraphs:
382
- text = para.text.strip()
383
- if text:
384
- paragraphs.append(text)
385
- return "\n\n".join(paragraphs)
386
-
387
-
388
- def extract_text_from_file(filepath: str) -> str:
389
- """Extract text from a file based on its extension."""
390
- ext = os.path.splitext(filepath)[1].lower()
391
-
392
- if ext == ".pdf":
393
- return extract_text_from_pdf(filepath)
394
- elif ext in (".docx", ".doc"):
395
- if ext == ".doc":
396
- # .doc (old format) — try converting with LibreOffice if available
397
- try:
398
- tmp_dir = tempfile.mkdtemp()
399
- subprocess.run(
400
- ["libreoffice", "--headless", "--convert-to", "docx",
401
- "--outdir", tmp_dir, filepath],
402
- capture_output=True, check=True, timeout=60,
403
- )
404
- docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
405
- docx_path = os.path.join(tmp_dir, docx_name)
406
- if os.path.exists(docx_path):
407
- text = extract_text_from_docx(docx_path)
408
- shutil.rmtree(tmp_dir, ignore_errors=True)
409
- return text
410
- except Exception:
411
- pass
412
- raise gr.Error(
413
- "Cannot read .doc files directly. Please save as .docx or .pdf and re-upload."
414
- )
415
- return extract_text_from_docx(filepath)
416
- else:
417
- # Plain text files (.txt, .md, etc.)
418
- with open(filepath, "r", encoding="utf-8", errors="replace") as f:
419
- return f.read()
420
-
421
-
422
- # ──────────────────────────────────────────────
423
- # Main pipeline
424
- # ──────────────────────────────────────────────
425
- def generate_audiobook(
426
- text_input: str,
427
- file_input,
428
- target_language: str,
429
- voice_label: str,
430
- add_pauses: bool,
431
- progress=gr.Progress(),
432
- ):
433
- """Main audiobook generation pipeline."""
434
-
435
- # ── Resolve text source ──
436
  if file_input is not None:
437
  try:
438
  progress(0.02, desc="Extracting text from document...")
@@ -447,32 +480,45 @@ def generate_audiobook(
447
  raise gr.Error("Please provide text or upload a file.")
448
 
449
  if len(text) < 10:
450
- raise gr.Error("Text is too short. Please provide more content.")
451
 
452
- # ── API key ──
453
  api_key = os.environ.get("DASHSCOPE_API_KEY", "")
454
  if not api_key:
455
- raise gr.Error(
456
- "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
457
- "(Settings → Secrets → New Secret)."
458
- )
459
 
460
- voice = get_voice_name(voice_label)
461
  lang_config = LANGUAGES[target_language]
 
462
  translate = target_language != "English"
463
  client = OpenAI(api_key=api_key, base_url=BASE_URL)
464
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
465
466
  try:
467
- # ── Split text ──
468
- progress(0.05, desc="Splitting text into chunks...")
469
  chunks = split_text_into_chunks(text)
470
  total_chunks = len(chunks)
471
  total_chars = sum(len(c) for c in chunks)
472
 
473
- progress(0.08, desc=f"Processing {total_chunks} chunks ({total_chars:,} characters)...")
474
-
475
- # ── Generate speech for each chunk ──
476
  audio_files = []
477
  all_transcripts = []
478
  silence_path = os.path.join(tmp_dir, "silence.wav")
@@ -480,39 +526,64 @@ def generate_audiobook(
480
  generate_silence(1.5, silence_path)
481
 
482
  for i, chunk in enumerate(chunks):
483
- frac = 0.1 + 0.8 * (i / total_chunks)
484
  progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
485
 
486
- wav_path, transcript = generate_speech_chunk(
487
- client, chunk, voice, target_language,
488
- lang_config, translate, i, tmp_dir,
489
- )
490
 
491
- if wav_path:
492
- audio_files.append(wav_path)
493
- # Add pause between chunks
494
- if add_pauses and i < total_chunks - 1:
495
- audio_files.append(silence_path)
 
 
496
  else:
497
- all_transcripts.append(f"⚠️ Chunk {i+1} failed: {transcript}")
498
- # Insert silence placeholder for failed chunk
499
- fail_silence = os.path.join(tmp_dir, f"fail_silence_{i:04d}.wav")
500
- generate_silence(2.0, fail_silence)
501
- audio_files.append(fail_silence)
502
 
503
- if transcript and not transcript.startswith("⚠️"):
504
- all_transcripts.append(transcript)
505
 
506
  if not audio_files:
507
- raise gr.Error("No audio was generated. Check your API key and try again.")
508
 
509
- # ── Concatenate all audio ──
510
- progress(0.92, desc="Assembling audiobook...")
511
  final_audio = os.path.join(tmp_dir, "audiobook.wav")
512
  concatenate_wavs(audio_files, final_audio)
513
 
514
- # ── Convert to MP3 for smaller file size ──
515
- progress(0.96, desc="Converting to MP3...")
516
  final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
517
  subprocess.run(
518
  ["ffmpeg", "-y", "-i", final_audio,
@@ -523,21 +594,20 @@ def generate_audiobook(
523
 
524
  progress(1.0, desc="Done!")
525
 
526
- # Build transcript display
527
- transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
528
-
529
- # Stats
530
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
 
531
  stats = (
532
  f"**Audiobook Generated!**\n\n"
533
  f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
534
  f"- **Language:** {target_language} ({lang_config['native']})\n"
535
- f"- **Voice:** {voice_label}\n"
 
536
  f"- **File size:** {audio_size:.1f} MB\n"
537
- f"- **Quality tier:** {lang_config['tier'].title()}\n"
538
  )
539
- if lang_config["tier"] == "extended":
540
- stats += "\n> ⚠️ This is an extended language. Voice quality may vary compared to the core 10 languages."
 
 
541
 
542
  return final_mp3, stats, transcript_text
543
 
@@ -545,33 +615,11 @@ def generate_audiobook(
545
  raise
546
  except Exception as e:
547
  raise gr.Error(f"Pipeline error: {str(e)}")
548
- finally:
549
- # Don't clean up tmp_dir yet — Gradio needs the files
550
- pass
551
-
552
 
553
- # ──────────────────────────────────────────────
554
- # Build language choices with tier labels
555
- # ──────────────────────────────────────────────
556
- def get_language_choices():
557
- core = [f"⭐ {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "core"]
558
- extended = [f" {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "extended"]
559
- return core + extended
560
 
561
-
562
- def clean_language_name(choice: str) -> str:
563
- """Remove the tier prefix from the dropdown choice."""
564
- return choice.replace("⭐ ", "").replace(" ", "").strip()
565
-
566
-
567
- def generate_wrapper(text_input, file_input, language_choice, voice, add_pauses, progress=gr.Progress()):
568
- language = clean_language_name(language_choice)
569
- return generate_audiobook(text_input, file_input, language, voice, add_pauses, progress)
570
-
571
-
572
- # ──────────────────────────────────────────────
573
- # Sample text
574
- # ──────────────────────────────────────────────
575
  SAMPLE_TEXT = """Chapter 1: The Beginning
576
 
577
  The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
@@ -580,41 +628,59 @@ The old lighthouse stood at the edge of the world, or so it seemed to the girl w
580
 
581
  The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
582
 
583
- Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather — grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.
584
 
585
  The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
586
 
587
  "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
588
 
589
- And he would smile — that slow, careful smile that seemed to cost him something each time — and begin."""
590
 
591
-
592
- # ──────────────────────────────────────────────
593
- # Gradio UI
594
- # ──────────────────────────────────────────────
595
  DESCRIPTION = """
596
- # 📖 Audiobook Generator
597
- ### English Text → Multi-Language Audiobook
 
598
 
599
- Paste or upload English text and get a professionally narrated audiobook in any of **36 languages**.
600
- The AI translates and narrates with expressive, audiobook-quality speech.
601
 
602
- ⭐ = Core language (best quality) · Others = Extended support
 
 
 
603
  """
604
 
605
- # Language dropdown choices
606
  lang_choices = []
607
- lang_choices.append("── Core Languages (Best Quality) ──")
608
  for name, cfg in LANGUAGES.items():
609
  if cfg["tier"] == "core":
610
- lang_choices.append(f"⭐ {name}")
611
- lang_choices.append("── Extended Languages ──")
612
  for name, cfg in LANGUAGES.items():
613
  if cfg["tier"] == "extended":
614
  lang_choices.append(name)
615
616
  with gr.Blocks(
617
- title="Audiobook Generator — Qwen3.5-Omni",
618
  theme=gr.themes.Soft(
619
  primary_hue="indigo",
620
  secondary_hue="purple",
@@ -625,81 +691,98 @@ with gr.Blocks(
625
  gr.Markdown(DESCRIPTION)
626
 
627
  with gr.Row():
628
- # ── Left column: Input ──
629
  with gr.Column(scale=1):
630
  text_input = gr.Textbox(
631
  label="English Text",
632
  placeholder="Paste your English text here...",
633
- lines=12,
634
- max_lines=30,
635
  )
636
-
637
  file_input = gr.File(
638
  label="Or Upload a Document (.txt, .md, .pdf, .docx)",
639
  file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
640
  type="filepath",
641
  )
 
642
 
643
- sample_btn = gr.Button("📄 Load Sample Text", variant="secondary", size="sm")
644
 
645
- with gr.Row():
646
- target_lang = gr.Dropdown(
647
- choices=[c for c in lang_choices if not c.startswith("──")],
648
- value="⭐ English",
649
- label="Target Language",
650
- info="⭐ = Core (best quality). Choose English for no translation.",
651
- )
652
 
653
- voice_select = gr.Dropdown(
654
- choices=ALL_VOICES,
655
- value="Jennifer — Cinematic narrator",
656
- label="Narrator Voice",
657
- )
658
 
659
  add_pauses = gr.Checkbox(
660
  value=True,
661
  label="Add pauses between sections",
662
- info="Adds 1.5s silence between text chunks for natural pacing",
663
  )
664
 
665
- generate_btn = gr.Button(
666
- "🎙️ Generate Audiobook",
667
- variant="primary",
668
- size="lg",
669
- )
670
 
671
- # ── Right column: Output ──
672
  with gr.Column(scale=1):
673
- audio_output = gr.Audio(
674
- label="Generated Audiobook",
675
- type="filepath",
676
- )
677
-
678
  stats_output = gr.Markdown(label="Generation Stats")
679
-
680
  with gr.Accordion("Translation / Narration Transcript", open=False):
681
  transcript_output = gr.Markdown()
682
 
683
- # ── Event handlers ──
684
- sample_btn.click(
685
- fn=lambda: SAMPLE_TEXT,
686
- outputs=text_input,
 
 
687
  )
688
 
689
  generate_btn.click(
690
  fn=generate_wrapper,
691
- inputs=[text_input, file_input, target_lang, voice_select, add_pauses],
 
692
  outputs=[audio_output, stats_output, transcript_output],
693
  )
694
 
695
- # ── Footer ──
696
  gr.Markdown(
697
  "---\n"
698
- "**Supported languages (36):** Arabic, Bengali, Cantonese, Chinese, Czech, Danish, Dutch, "
699
- "English, Filipino, Finnish, French, German, Greek, Hebrew, Hindi, Hungarian, Indonesian, "
700
- "Italian, Japanese, Korean, Malay, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, "
701
- "Spanish, Swahili, Swedish, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese\n\n"
702
- )
703
 
704
  if __name__ == "__main__":
705
  demo.launch()
 
1
  """
2
+ Audiobook Generator - English Source to Multi-Language Audio
3
+ Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC via DashScope API
4
+
5
+ Three voice modes:
6
+ 1. Preset Voices: Use built-in Qwen voices (via Qwen3.5-Omni-Plus)
7
+ 2. Cloned Voice: Clone a voice from audio sample (via Qwen3-TTS-VC)
8
+ 3. Both support translation from English to 36 languages
9
+
10
+ Deploy as a Hugging Face Space:
11
+ 1. Create a new Space (SDK: Gradio)
12
+ 2. Upload app.py and requirements.txt
13
+ 3. Add DASHSCOPE_API_KEY as a Space Secret
14
  """
15
 
16
  import os
17
  import base64
18
+ import json
19
  import math
20
+ import pathlib
21
  import shutil
22
  import struct
23
  import subprocess
 
26
  import re
27
 
28
  import gradio as gr
29
+ import requests as http_requests
30
  from openai import OpenAI
31
 
32
+ # Optional document parsers
33
  try:
34
  import pypdf
35
  HAS_PYPDF = True
 
42
  except ImportError:
43
  HAS_DOCX = False
44
 
 
45
  # Configuration
46
+ OMNI_MODEL = "qwen3.5-omni-plus"
47
+ TTS_VC_MODEL = "qwen3-tts-vc-2026-01-22"
48
+ VOICE_CLONE_MODEL = "qwen-voice-enrollment"
49
+
50
  BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
51
+ DASHSCOPE_API_URL = "https://dashscope-intl.aliyuncs.com/api/v1"
52
+ VOICE_CLONE_URL = f"{DASHSCOPE_API_URL}/services/audio/tts/customization"
53
+ TTS_SYNTHESIS_URL = f"{DASHSCOPE_API_URL}/services/aigc/multimodal-generation/generation"
54
 
 
 
55
  MAX_CHARS_PER_CHUNK = 1500
56
 
57
+ # Languages
 
 
58
  LANGUAGES = {
 
59
  "English": {"code": "en", "native": "English", "tier": "core"},
60
+ "Chinese (Mandarin)": {"code": "zh", "native": "Chinese", "tier": "core"},
61
+ "Japanese": {"code": "ja", "native": "Japanese", "tier": "core"},
62
+ "Korean": {"code": "ko", "native": "Korean", "tier": "core"},
63
  "German": {"code": "de", "native": "Deutsch", "tier": "core"},
64
+ "French": {"code": "fr", "native": "Francais", "tier": "core"},
65
+ "Russian": {"code": "ru", "native": "Russian", "tier": "core"},
66
+ "Portuguese": {"code": "pt", "native": "Portugues", "tier": "core"},
67
+ "Spanish": {"code": "es", "native": "Espanol", "tier": "core"},
68
  "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
69
+ "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
 
70
  "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
71
  "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
72
+ "Turkish": {"code": "tr", "native": "Turkce", "tier": "extended"},
73
+ "Vietnamese": {"code": "vi", "native": "Tieng Viet", "tier": "extended"},
74
+ "Thai": {"code": "th", "native": "Thai", "tier": "extended"},
75
  "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
76
  "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
77
+ "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
78
+ "Bengali": {"code": "bn", "native": "Bengali", "tier": "extended"},
79
+ "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
80
  "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
81
+ "Czech": {"code": "cs", "native": "Cestina", "tier": "extended"},
82
+ "Romanian": {"code": "ro", "native": "Romana", "tier": "extended"},
83
+ "Greek": {"code": "el", "native": "Greek", "tier": "extended"},
84
  "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
85
  "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
86
  "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
87
  "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
88
+ "Ukrainian": {"code": "uk", "native": "Ukrainian", "tier": "extended"},
89
+ "Hebrew": {"code": "he", "native": "Hebrew", "tier": "extended"},
90
+ "Persian": {"code": "fa", "native": "Farsi", "tier": "extended"},
91
+ "Cantonese": {"code": "yue", "native": "Cantonese", "tier": "extended"},
92
  "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
93
  "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
94
+ "Tamil": {"code": "ta", "native": "Tamil", "tier": "extended"},
95
  }
96
 
97
+ VOICE_CLONE_LANGUAGES = {
98
+ "English", "Chinese (Mandarin)", "Japanese", "Korean", "German",
99
+ "French", "Russian", "Portuguese", "Spanish", "Italian",
100
  }
101
 
102
+ PRESET_VOICES = [
103
+ "Cherry -- Sunny, friendly",
104
+ "Serena -- Gentle, soft",
105
+ "Jennifer -- Cinematic narrator",
106
+ "Katerina -- Mature, rich rhythm",
107
+ "Ethan -- Warm, energetic",
108
+ "Ryan -- Dramatic, rhythmic",
109
+ "Kai -- Soothing, calm",
110
+ "Neil -- Precise, clear",
111
+ "Lenn -- Rational, steady",
112
+ "Aiden -- Young, lively",
113
+ "Eldric Sage -- Authoritative narrator",
114
+ "Arthur -- Classic, mature",
115
+ "Mia -- Young, versatile",
116
+ "Bella -- Elegant, warm",
117
+ "Vivian -- Professional, clear",
118
+ "Seren -- Calm, measured",
119
+ "Dolce -- Sweet, melodic",
120
+ "Bellona -- Strong, commanding",
121
+ "Vincent -- Rich, theatrical",
122
+ "Andre -- Deep, resonant",
123
+ ]
124
+
125
+
126
+ def get_voice_name(label):
127
+ return label.split("--")[0].strip()
128
 
 
 
 
129
 
 
 
130
  # Audio helpers
131
+ def base64_to_wav(b64_data, output_path):
 
 
132
  audio_bytes = base64.b64decode(b64_data)
133
+ sr = 24000
134
+ nc = 1
135
+ bps = 16
136
+ br = sr * nc * bps // 8
137
+ ba = nc * bps // 8
138
+ ds = len(audio_bytes)
139
  with open(output_path, "wb") as f:
140
  f.write(b"RIFF")
141
+ f.write(struct.pack("<I", 36 + ds))
142
  f.write(b"WAVE")
143
  f.write(b"fmt ")
144
  f.write(struct.pack("<I", 16))
145
  f.write(struct.pack("<H", 1))
146
+ f.write(struct.pack("<H", nc))
147
+ f.write(struct.pack("<I", sr))
148
+ f.write(struct.pack("<I", br))
149
+ f.write(struct.pack("<H", ba))
150
+ f.write(struct.pack("<H", bps))
151
  f.write(b"data")
152
+ f.write(struct.pack("<I", ds))
153
  f.write(audio_bytes)
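base64_to_wav writes a fixed 44-byte RIFF/WAVE header for 24 kHz, mono, 16-bit PCM. A quick sanity check with the standard-library wave module (the file name is a placeholder for any chunk produced by the function):
import wave

# Illustration only: verify the header written by base64_to_wav parses correctly.
with wave.open("chunk_0000.wav", "rb") as w:
    assert w.getframerate() == 24000  # sample rate from the fmt chunk
    assert w.getnchannels() == 1      # mono
    assert w.getsampwidth() == 2      # 16-bit PCM = 2 bytes per sample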
154
 
155
 
156
+ def concatenate_wavs(wav_files, output_path):
 
157
  if not wav_files:
158
  return
159
  if len(wav_files) == 1:
 
171
  os.remove(list_file)
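The middle of concatenate_wavs is unchanged and elided by the diff; judging from the list_file cleanup above, it follows the usual ffmpeg concat-demuxer pattern. A hedged sketch of that pattern (names illustrative, not the author's exact code):
# Illustrative sketch of the concat-demuxer approach (not the diff's exact body).
def concat_wavs_sketch(wav_files, output_path, list_file="concat_list.txt"):
    with open(list_file, "w") as f:
        for path in wav_files:
            f.write(f"file '{path}'\n")
    subprocess.run(
        ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
         "-i", list_file, "-c", "copy", output_path],
        capture_output=True, check=True,
    )
    os.remove(list_file)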
172
 
173
 
174
+ def generate_silence(duration_sec, output_path):
175
+ subprocess.run(
176
+ ["ffmpeg", "-y", "-f", "lavfi",
177
+ "-i", "anullsrc=r=24000:cl=mono",
178
+ "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
179
+ capture_output=True, check=True,
180
+ )
181
+
182
+
183
+ # Document extraction
184
+ def extract_text_from_pdf(filepath):
185
+ if not HAS_PYPDF:
186
+ raise ImportError("pypdf is not installed.")
187
+ reader = pypdf.PdfReader(filepath)
188
+ pages = []
189
+ for page in reader.pages:
190
+ text = page.extract_text()
191
+ if text:
192
+ pages.append(text.strip())
193
+ return "\n\n".join(pages)
194
+
195
+
196
+ def extract_text_from_docx(filepath):
197
+ if not HAS_DOCX:
198
+ raise ImportError("python-docx is not installed.")
199
+ doc = docx.Document(filepath)
200
+ paragraphs = []
201
+ for para in doc.paragraphs:
202
+ text = para.text.strip()
203
+ if text:
204
+ paragraphs.append(text)
205
+ return "\n\n".join(paragraphs)
206
+
207
+
208
+ def extract_text_from_file(filepath):
209
+ ext = os.path.splitext(filepath)[1].lower()
210
+ if ext == ".pdf":
211
+ return extract_text_from_pdf(filepath)
212
+ elif ext in (".docx", ".doc"):
213
+ if ext == ".doc":
214
+ try:
215
+ tmp_dir = tempfile.mkdtemp()
216
+ subprocess.run(
217
+ ["libreoffice", "--headless", "--convert-to", "docx",
218
+ "--outdir", tmp_dir, filepath],
219
+ capture_output=True, check=True, timeout=60,
220
+ )
221
+ docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
222
+ docx_path = os.path.join(tmp_dir, docx_name)
223
+ if os.path.exists(docx_path):
224
+ text = extract_text_from_docx(docx_path)
225
+ shutil.rmtree(tmp_dir, ignore_errors=True)
226
+ return text
227
+ except Exception:
228
+ pass
229
+ raise gr.Error("Cannot read .doc files. Please save as .docx or .pdf.")
230
+ return extract_text_from_docx(filepath)
231
+ else:
232
+ with open(filepath, "r", encoding="utf-8", errors="replace") as f:
233
+ return f.read()
234
+
235
+
236
  # Text splitting
237
+ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
238
  text = text.strip()
239
  if not text:
240
  return []
 
 
241
  if len(text) <= max_chars:
242
  return [text]
243
 
244
  chunks = []
 
245
  paragraphs = re.split(r"\n\s*\n", text)
 
246
  current_chunk = ""
247
+
248
  for para in paragraphs:
249
  para = para.strip()
250
  if not para:
251
  continue
 
 
252
  if len(current_chunk) + len(para) + 2 <= max_chars:
253
  current_chunk = (current_chunk + "\n\n" + para).strip()
254
  else:
 
255
  if current_chunk:
256
  chunks.append(current_chunk)
257
  current_chunk = ""
 
 
258
  if len(para) > max_chars:
259
  sentences = re.split(r"(?<=[.!?])\s+", para)
260
  for sentence in sentences:
 
263
  else:
264
  if current_chunk:
265
  chunks.append(current_chunk)
 
266
  if len(sentence) > max_chars:
267
  words = sentence.split()
268
  current_chunk = ""
 
280
 
281
  if current_chunk:
282
  chunks.append(current_chunk)
 
283
  return chunks
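An illustrative use of the splitter above: paragraphs are kept whole when they fit, and no chunk in this example exceeds MAX_CHARS_PER_CHUNK.
# Example (illustration only): five ~1,000-character paragraphs, each under the limit.
sample = ("A paragraph of narration. " * 40 + "\n\n") * 5
chunks = split_text_into_chunks(sample)
assert all(len(c) <= MAX_CHARS_PER_CHUNK for c in chunks)
print(f"{len(chunks)} chunks, longest {max(len(c) for c in chunks)} chars")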
284
 
285
 
286
+ # ==============================
287
+ # VOICE CLONING
288
+ # ==============================
289
+ def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
290
+ filepath = pathlib.Path(audio_path)
291
+ if not filepath.exists():
292
+ raise FileNotFoundError(f"Audio file not found: {audio_path}")
293
+
294
+ ext = filepath.suffix.lower()
295
+ mime_map = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".m4a": "audio/mp4"}
296
+ mime_type = mime_map.get(ext, "audio/mpeg")
297
+
298
+ b64_str = base64.b64encode(filepath.read_bytes()).decode()
299
+ data_uri = f"data:{mime_type};base64,{b64_str}"
300
+
301
+ payload = {
302
+ "model": VOICE_CLONE_MODEL,
303
+ "input": {
304
+ "action": "create",
305
+ "target_model": TTS_VC_MODEL,
306
+ "preferred_name": preferred_name,
307
+ "audio": {"data": data_uri},
308
+ },
309
+ }
310
+ headers = {
311
+ "Authorization": f"Bearer {api_key}",
312
+ "Content-Type": "application/json",
313
+ }
314
+
315
+ resp = http_requests.post(VOICE_CLONE_URL, json=payload, headers=headers, timeout=60)
316
+ if resp.status_code != 200:
317
+ raise RuntimeError(f"Voice cloning failed ({resp.status_code}): {resp.text}")
318
+
319
+ try:
320
+ return resp.json()["output"]["voice"]
321
+ except (KeyError, ValueError) as e:
322
+ raise RuntimeError(f"Failed to parse voice clone response: {e}\n{resp.text}")
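A hedged usage sketch for the enrollment helper above; the sample path is a placeholder, and the returned voice ID is whatever DashScope provides.
# Hypothetical call (illustration only): enroll one clean ~30 s sample, reuse the ID.
api_key = os.environ["DASHSCOPE_API_KEY"]
voice_id = clone_voice("narrator_sample.wav", api_key, preferred_name="my_narrator")
print("Enrolled voice:", voice_id)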
323
+
324
+
325
+ # ==============================
326
+ # TTS WITH CLONED VOICE
327
+ # ==============================
328
+ def synthesize_with_cloned_voice(text, voice_id, language, api_key, output_dir, chunk_index):
329
+ lang_type_map = {
330
+ "English": "English", "Chinese (Mandarin)": "Chinese",
331
+ "Japanese": "Japanese", "Korean": "Korean",
332
+ "German": "German", "French": "French",
333
+ "Russian": "Russian", "Portuguese": "Portuguese",
334
+ "Spanish": "Spanish", "Italian": "Italian",
335
+ }
336
+ language_type = lang_type_map.get(language, "English")
337
+
338
+ payload = {
339
+ "model": TTS_VC_MODEL,
340
+ "input": {
341
+ "text": text,
342
+ "voice": voice_id,
343
+ "language_type": language_type,
344
+ },
345
+ }
346
+ headers = {
347
+ "Authorization": f"Bearer {api_key}",
348
+ "Content-Type": "application/json",
349
+ }
350
+
351
+ try:
352
+ resp = http_requests.post(TTS_SYNTHESIS_URL, json=payload, headers=headers, timeout=120)
353
+ if resp.status_code != 200:
354
+ return None, f"TTS failed ({resp.status_code}): {resp.text[:200]}"
355
+
356
+ result = resp.json()
357
+ audio_url = result.get("output", {}).get("audio", {}).get("url")
358
+ if not audio_url:
359
+ return None, f"No audio URL in response: {json.dumps(result)[:200]}"
360
+
361
+ output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
362
+ audio_resp = http_requests.get(audio_url, timeout=120)
363
+ if audio_resp.status_code != 200:
364
+ return None, "Failed to download audio from URL"
365
+
366
+ with open(output_wav, "wb") as f:
367
+ f.write(audio_resp.content)
368
+
369
+ return output_wav, None
370
+
371
+ except Exception as e:
372
+ return None, str(e)
373
+
374
+
375
+ # ==============================
376
+ # TRANSLATION (text only)
377
+ # ==============================
378
+ def translate_text(client, text, target_language, lang_config):
379
+ response = client.chat.completions.create(
380
+ model=OMNI_MODEL,
381
+ modalities=["text"],
382
+ messages=[
383
+ {
384
+ "role": "system",
385
+ "content": (
386
+ f"You are a professional translator. Translate English text into "
387
+ f"natural, fluent {target_language} ({lang_config['native']}). "
388
+ f"Output ONLY the translated text."
389
+ ),
390
+ },
391
+ {
392
+ "role": "user",
393
+ "content": f"Translate the following into {target_language}:\n\n{text}",
394
+ },
395
+ ],
396
+ )
397
+ return response.choices[0].message.content.strip()
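Putting the helpers together, a sketch of the cloned-voice path for a single chunk, mirroring what generate_audiobook does below (paths and text are placeholders):
# Cloned-voice pipeline for one chunk (illustration only).
api_key = os.environ["DASHSCOPE_API_KEY"]
client = OpenAI(api_key=api_key, base_url=BASE_URL)
out_dir = tempfile.mkdtemp(prefix="audiobook_")

voice_id = clone_voice("narrator_sample.wav", api_key)                     # 1. enroll the voice
chunk = "The old lighthouse stood at the edge of the world."
spanish = translate_text(client, chunk, "Spanish", LANGUAGES["Spanish"])   # 2. text-only translation
wav_path, err = synthesize_with_cloned_voice(                              # 3. speak with the cloned voice
    spanish, voice_id, "Spanish", api_key, out_dir, 0)
print(wav_path or f"TTS error: {err}")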
398
+
399
+
400
+ # ==============================
401
+ # SPEECH WITH PRESET VOICE
402
+ # ==============================
403
+ def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
404
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
405
 
406
  if translate and language != "English":
407
  system_prompt = (
408
  f"You are a professional audiobook narrator and translator.\n"
409
+ f"Translate the English text into natural {language} ({lang_config['native']}).\n"
410
+ f"Read the translation aloud with expressive audiobook narration.\n"
411
+ f"Respond ONLY with the spoken {language} narration."
 
 
 
 
 
 
 
 
 
 
412
  )
413
+ user_text = f"Translate into {language} and narrate as an audiobook:\n\n{text}"
414
  else:
415
  system_prompt = (
416
  "You are a professional audiobook narrator.\n"
417
+ "Read the text with clear, expressive narration.\n"
418
+ "Respond ONLY with the spoken narration."
 
 
 
 
 
419
  )
420
+ user_text = f"Narrate as an audiobook:\n\n{text}"
421
 
422
  try:
423
  completion = client.chat.completions.create(
424
+ model=OMNI_MODEL,
425
  messages=[
426
  {"role": "system", "content": system_prompt},
427
  {"role": "user", "content": user_text},
 
454
  full_audio_b64 = "".join(audio_chunks)
455
  base64_to_wav(full_audio_b64, output_wav)
456
  return output_wav, transcript
457
+ return None, "No audio received"
 
458
 
459
  except Exception as e:
460
  return None, str(e)
461
 
462
 
463
+ # ==============================
464
+ # MAIN PIPELINE
465
+ # ==============================
466
+ def generate_audiobook(text_input, file_input, target_language, voice_mode,
467
+ preset_voice_label, clone_audio, add_pauses, progress=gr.Progress()):
468
+ # Resolve text
469
  if file_input is not None:
470
  try:
471
  progress(0.02, desc="Extracting text from document...")
 
480
  raise gr.Error("Please provide text or upload a file.")
481
 
482
  if len(text) < 10:
483
+ raise gr.Error("Text is too short.")
484
 
 
485
  api_key = os.environ.get("DASHSCOPE_API_KEY", "")
486
  if not api_key:
487
+ raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
 
 
 
488
 
 
489
  lang_config = LANGUAGES[target_language]
490
+ use_clone = voice_mode == "Clone a Voice"
491
  translate = target_language != "English"
492
  client = OpenAI(api_key=api_key, base_url=BASE_URL)
493
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
494
 
495
+ # Voice cloning setup
496
+ cloned_voice_id = None
497
+ if use_clone:
498
+ if clone_audio is None:
499
+ raise gr.Error("Please upload a voice sample (10-60 seconds of clear speech).")
500
+
501
+ if target_language not in VOICE_CLONE_LANGUAGES:
502
+ raise gr.Error(
503
+ f"Voice cloning TTS supports: {', '.join(sorted(VOICE_CLONE_LANGUAGES))}. "
504
+ f"'{target_language}' is not supported with cloned voices. Use a preset voice instead."
505
+ )
506
+
507
+ progress(0.03, desc="Cloning voice from audio sample...")
508
+ try:
509
+ cloned_voice_id = clone_voice(clone_audio, api_key)
510
+ progress(0.08, desc="Voice cloned successfully!")
511
+ except Exception as e:
512
+ raise gr.Error(f"Voice cloning failed: {e}")
513
+
514
  try:
515
+ # Split text
516
+ progress(0.10, desc="Splitting text into chunks...")
517
  chunks = split_text_into_chunks(text)
518
  total_chunks = len(chunks)
519
  total_chars = sum(len(c) for c in chunks)
520
 
521
+ # Process each chunk
 
 
522
  audio_files = []
523
  all_transcripts = []
524
  silence_path = os.path.join(tmp_dir, "silence.wav")
 
526
  generate_silence(1.5, silence_path)
527
 
528
  for i, chunk in enumerate(chunks):
529
+ frac = 0.12 + 0.75 * (i / total_chunks)
530
  progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
531
 
532
+ if use_clone:
533
+ # CLONED VOICE PIPELINE
534
+ final_text = chunk
535
+ if translate:
536
+ try:
537
+ final_text = translate_text(client, chunk, target_language, lang_config)
538
+ all_transcripts.append(final_text)
539
+ except Exception as e:
540
+ all_transcripts.append(f"Translation failed for chunk {i+1}: {e}")
541
+ final_text = chunk
542
+
543
+ wav_path, error = synthesize_with_cloned_voice(
544
+ final_text, cloned_voice_id, target_language, api_key, tmp_dir, i,
545
+ )
546
 
547
+ if wav_path:
548
+ audio_files.append(wav_path)
549
+ else:
550
+ all_transcripts.append(f"TTS failed for chunk {i+1}: {error}")
551
+ fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
552
+ generate_silence(2.0, fail_silence)
553
+ audio_files.append(fail_silence)
554
  else:
555
+ # PRESET VOICE PIPELINE
556
+ voice = get_voice_name(preset_voice_label)
557
+ wav_path, transcript = generate_speech_preset(
558
+ client, chunk, voice, target_language,
559
+ lang_config, translate, i, tmp_dir,
560
+ )
561
+
562
+ if wav_path:
563
+ audio_files.append(wav_path)
564
+ else:
565
+ all_transcripts.append(f"Chunk {i+1} failed: {transcript}")
566
+ fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
567
+ generate_silence(2.0, fail_silence)
568
+ audio_files.append(fail_silence)
569
 
570
+ if transcript and "failed" not in transcript.lower():
571
+ all_transcripts.append(transcript)
572
+
573
+ # Pause between chunks
574
+ if add_pauses and i < total_chunks - 1 and audio_files:
575
+ audio_files.append(silence_path)
576
 
577
  if not audio_files:
578
+ raise gr.Error("No audio was generated.")
579
 
580
+ # Concatenate
581
+ progress(0.90, desc="Assembling audiobook...")
582
  final_audio = os.path.join(tmp_dir, "audiobook.wav")
583
  concatenate_wavs(audio_files, final_audio)
584
 
585
+ # Convert to MP3
586
+ progress(0.95, desc="Converting to MP3...")
587
  final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
588
  subprocess.run(
589
  ["ffmpeg", "-y", "-i", final_audio,
 
594
 
595
  progress(1.0, desc="Done!")
596
 
 
 
 
 
597
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
598
+ voice_info = f"Cloned voice (ID: {cloned_voice_id[:20]}...)" if use_clone else preset_voice_label
599
  stats = (
600
  f"**Audiobook Generated!**\n\n"
601
  f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
602
  f"- **Language:** {target_language} ({lang_config['native']})\n"
603
+ f"- **Voice:** {voice_info}\n"
604
+ f"- **Mode:** {'Voice Clone via Qwen3-TTS-VC' if use_clone else 'Preset via Qwen3.5-Omni-Plus'}\n"
605
  f"- **File size:** {audio_size:.1f} MB\n"
 
606
  )
607
+ if lang_config["tier"] == "extended" and not use_clone:
608
+ stats += "\n> Note: Extended language - voice quality may vary."
609
+
610
+ transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
611
 
612
  return final_mp3, stats, transcript_text
613
 
 
615
  raise
616
  except Exception as e:
617
  raise gr.Error(f"Pipeline error: {str(e)}")
618
619
 
620
+ # ==============================
621
+ # GRADIO UI
622
+ # ==============================
623
  SAMPLE_TEXT = """Chapter 1: The Beginning
624
 
625
  The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
 
628
 
629
  The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
630
 
631
+ Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather - grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.
632
 
633
  The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
634
 
635
  "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
636
 
637
+ And he would smile - that slow, careful smile that seemed to cost him something each time - and begin."""
638
639
  DESCRIPTION = """
640
+ # Audiobook Generator
641
+ ### English Text to Multi-Language Audiobook with Voice Cloning
642
+ **Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC**
643
 
644
+ Upload English text and generate a narrated audiobook in **36 languages**.
645
+ Choose a **preset voice** or **clone any voice** from a short audio sample!
646
 
647
+ | Mode | Model | Languages | How it works |
648
+ |------|-------|-----------|-------------|
649
+ | **Preset Voice** | Qwen3.5-Omni-Plus | 36 languages | Translates + speaks in one step |
650
+ | **Clone a Voice** | Qwen3-TTS-VC | 10 core languages | Clones voice, translates, then speaks |
651
  """
652
 
 
653
  lang_choices = []
 
654
  for name, cfg in LANGUAGES.items():
655
  if cfg["tier"] == "core":
656
+ lang_choices.append(f"* {name}")
 
657
  for name, cfg in LANGUAGES.items():
658
  if cfg["tier"] == "extended":
659
  lang_choices.append(name)
660
 
661
+
662
+ def clean_language_name(choice):
663
+ return choice.replace("* ", "").strip()
664
+
665
+
666
+ def on_voice_mode_change(mode):
667
+ if mode == "Clone a Voice":
668
+ return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
669
+ else:
670
+ return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
671
+
672
+
673
+ def generate_wrapper(text_input, file_input, language_choice, voice_mode,
674
+ preset_voice, clone_audio, add_pauses, progress=gr.Progress()):
675
+ language = clean_language_name(language_choice)
676
+ return generate_audiobook(
677
+ text_input, file_input, language, voice_mode,
678
+ preset_voice, clone_audio, add_pauses, progress,
679
+ )
680
+
681
+
682
  with gr.Blocks(
683
+ title="Audiobook Generator",
684
  theme=gr.themes.Soft(
685
  primary_hue="indigo",
686
  secondary_hue="purple",
 
691
  gr.Markdown(DESCRIPTION)
692
 
693
  with gr.Row():
 
694
  with gr.Column(scale=1):
695
  text_input = gr.Textbox(
696
  label="English Text",
697
  placeholder="Paste your English text here...",
698
+ lines=10,
699
+ max_lines=25,
700
  )
 
701
  file_input = gr.File(
702
  label="Or Upload a Document (.txt, .md, .pdf, .docx)",
703
  file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
704
  type="filepath",
705
  )
706
+ sample_btn = gr.Button("Load Sample Text", variant="secondary", size="sm")
707
 
708
+ target_lang = gr.Dropdown(
709
+ choices=lang_choices,
710
+ value="* English",
711
+ label="Target Language",
712
+ info="* = Core (best quality). Voice cloning supports core languages only.",
713
+ )
714
 
715
+ voice_mode = gr.Radio(
716
+ choices=["Preset Voice", "Clone a Voice"],
717
+ value="Preset Voice",
718
+ label="Voice Mode",
719
+ )
 
 
720
 
721
+ preset_voice = gr.Dropdown(
722
+ choices=PRESET_VOICES,
723
+ value="Jennifer -- Cinematic narrator",
724
+ label="Preset Narrator Voice",
725
+ visible=True,
726
+ )
727
+
728
+ clone_audio = gr.Audio(
729
+ label="Upload Voice Sample (10-60s of clear speech, WAV/MP3/M4A)",
730
+ type="filepath",
731
+ visible=False,
732
+ )
733
+
734
+ clone_info = gr.Markdown(
735
+ value=(
736
+ "> **Voice cloning tips:**\n"
737
+ "> - Use 10-60 seconds of clear, single-speaker audio\n"
738
+ "> - No background music or noise\n"
739
+ "> - WAV (16-bit), MP3, or M4A format\n"
740
+ "> - Sample rate at least 24 kHz recommended\n"
741
+ "> - Cloned voice TTS supports 10 core languages only"
742
+ ),
743
+ visible=False,
744
+ )
745
 
746
  add_pauses = gr.Checkbox(
747
  value=True,
748
  label="Add pauses between sections",
749
+ info="1.5s silence between chunks",
750
  )
751
 
752
+ generate_btn = gr.Button("Generate Audiobook", variant="primary", size="lg")
753
 
 
754
  with gr.Column(scale=1):
755
+ audio_output = gr.Audio(label="Generated Audiobook", type="filepath")
756
  stats_output = gr.Markdown(label="Generation Stats")
 
757
  with gr.Accordion("Translation / Narration Transcript", open=False):
758
  transcript_output = gr.Markdown()
759
 
760
+ sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)
761
+
762
+ voice_mode.change(
763
+ fn=on_voice_mode_change,
764
+ inputs=voice_mode,
765
+ outputs=[preset_voice, clone_audio, clone_info],
766
  )
767
 
768
  generate_btn.click(
769
  fn=generate_wrapper,
770
+ inputs=[text_input, file_input, target_lang, voice_mode,
771
+ preset_voice, clone_audio, add_pauses],
772
  outputs=[audio_output, stats_output, transcript_output],
773
  )
774
 
 
775
  gr.Markdown(
776
  "---\n"
777
+ "**How it works:**\n\n"
778
+ "**Preset voice mode:** Text goes to Qwen3.5-Omni-Plus (translates + speaks in one call) then outputs MP3\n\n"
779
+ "**Clone voice mode:** Voice sample goes to Qwen Voice Enrollment (creates voice ID), "
780
+ "text goes to Qwen3.5-Omni-Plus (translates to target language), "
781
+ "then Qwen3-TTS-VC (synthesizes speech with cloned voice) outputs MP3\n\n"
782
+ "**Voice cloning supports:** Chinese, English, Japanese, Korean, German, French, "
783
+ "Russian, Portuguese, Spanish, Italian\n\n"
784
+ "Built with Gradio | Model by Alibaba Qwen | API via DashScope"
785
+ )
786
 
787
  if __name__ == "__main__":
788
  demo.launch()
requirements.txt CHANGED
@@ -3,3 +3,4 @@ gradio>=5.25.0
3
  audioop-lts; python_version >= "3.13"
4
  pypdf>=4.0.0
5
  python-docx>=1.1.0
 
 
3
  audioop-lts; python_version >= "3.13"
4
  pypdf>=4.0.0
5
  python-docx>=1.1.0
6
+ requests>=2.31.0