PlotweaverModel committed on
Commit 33bd369 · verified · 1 Parent(s): 76d6cf7

files update

Files changed (2)
  1. app.py +461 -378
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,15 +1,23 @@
1
  """
2
- 📖 Audiobook Generator — English Source to Multi-Language Audio
3
-
4
- Two modes:
5
- 1. Translation + TTS: Translate English text to target language, then generate speech
6
- 2. Direct TTS: Generate speech from English text directly
7
-
8
  """
9
 
10
  import os
11
  import base64
 
12
  import math
 
13
  import shutil
14
  import struct
15
  import subprocess
@@ -18,9 +26,10 @@ import time
18
  import re
19
 
20
  import gradio as gr
 
21
  from openai import OpenAI
22
 
23
- # Optional document parsers — installed via requirements.txt
24
  try:
25
  import pypdf
26
  HAS_PYPDF = True
@@ -33,139 +42,118 @@ try:
33
  except ImportError:
34
  HAS_DOCX = False
35
 
36
- # ──────────────────────────────────────────────
37
  # Configuration
38
- # ──────────────────────────────────────────────
39
- MODEL = "qwen3.5-omni-plus"
 
 
40
  BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
 
 
 
41
 
42
- # Maximum characters per chunk sent to the API
43
- # The model has token limits, so we split long texts
44
  MAX_CHARS_PER_CHUNK = 1500
45
 
46
- # All 36 speech output languages supported by Qwen3.5-Omni
47
- # Core 10 languages have the best quality; extended languages are supported
48
- # but may vary in quality as they include dialects
49
  LANGUAGES = {
50
- # ── Core 10 Languages (highest quality) ──
51
  "English": {"code": "en", "native": "English", "tier": "core"},
52
- "Chinese (Mandarin)": {"code": "zh", "native": "中文", "tier": "core"},
53
- "Japanese": {"code": "ja", "native": "日本語", "tier": "core"},
54
- "Korean": {"code": "ko", "native": "한국어", "tier": "core"},
55
  "German": {"code": "de", "native": "Deutsch", "tier": "core"},
56
- "French": {"code": "fr", "native": "Français", "tier": "core"},
57
- "Russian": {"code": "ru", "native": "Русский", "tier": "core"},
58
- "Portuguese": {"code": "pt", "native": "Português", "tier": "core"},
59
- "Spanish": {"code": "es", "native": "Español", "tier": "core"},
60
  "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
61
- # ── Extended Languages (Qwen3.5-Omni expanded to 36) ──
62
- "Arabic": {"code": "ar", "native": "العربية", "tier": "extended"},
63
  "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
64
  "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
65
- "Turkish": {"code": "tr", "native": "Türkçe", "tier": "extended"},
66
- "Vietnamese": {"code": "vi", "native": "Tiếng Việt", "tier": "extended"},
67
- "Thai": {"code": "th", "native": "ภาษาไทย", "tier": "extended"},
68
  "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
69
  "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
70
- "Hindi": {"code": "hi", "native": "हिन्दी", "tier": "extended"},
71
- "Bengali": {"code": "bn", "native": "বাংলা", "tier": "extended"},
72
- "Urdu": {"code": "ur", "native": "اردو", "tier": "extended"},
73
  "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
74
- "Czech": {"code": "cs", "native": "Čeština", "tier": "extended"},
75
- "Romanian": {"code": "ro", "native": "Română", "tier": "extended"},
76
- "Greek": {"code": "el", "native": "Ελληνικά", "tier": "extended"},
77
  "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
78
  "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
79
  "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
80
  "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
81
- "Ukrainian": {"code": "uk", "native": "Українська", "tier": "extended"},
82
- "Hebrew": {"code": "he", "native": "עברית", "tier": "extended"},
83
- "Persian": {"code": "fa", "native": "فارسی", "tier": "extended"},
84
- "Cantonese": {"code": "yue", "native": "粵語", "tier": "extended"},
85
  "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
86
  "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
87
- "Tamil": {"code": "ta", "native": "தமிழ்", "tier": "extended"},
88
  }
89
 
90
- VOICES = {
91
- "Male Voices": [
92
- "Ethan — Warm, energetic",
93
- "Ryan — Dramatic, rhythmic",
94
- "Kai — Soothing, calm",
95
- "Neil — Precise, clear",
96
- "Lenn — Rational, steady",
97
- "Aiden — Young, lively",
98
- "Eldric Sage — Authoritative narrator",
99
- "Arthur — Classic, mature",
100
- "Elias — Soft, thoughtful",
101
- "Alek — Confident, modern",
102
- "Andre — Deep, resonant",
103
- "Emilien — Gentle, French-inspired",
104
- "Vincent — Rich, theatrical",
105
- ],
106
- "Female Voices": [
107
- "Cherry — Sunny, friendly",
108
- "Serena — Gentle, soft",
109
- "Jennifer — Cinematic narrator",
110
- "Katerina — Mature, rich rhythm",
111
- "Chelsie — Bright, expressive",
112
- "Mia — Young, versatile",
113
- "Bella — Elegant, warm",
114
- "Vivian — Professional, clear",
115
- "Moon — Dreamy, ethereal",
116
- "Maia — Confident, articulate",
117
- "Seren — Calm, measured",
118
- "Dolce — Sweet, melodic",
119
- "Bellona — Strong, commanding",
120
- "Bunny — Playful, light",
121
- "Momo — Cute, upbeat",
122
- "Mochi — Soft, adorable",
123
- ],
124
  }
125
 
126
- # Flatten voice list for the dropdown
127
- ALL_VOICES = []
128
- for category, voices in VOICES.items():
129
- for v in voices:
130
- ALL_VOICES.append(v)
131
-
132
 
133
- def get_voice_name(voice_label: str) -> str:
134
- """Extract just the voice name from 'Name — Description' format."""
135
- return voice_label.split("—")[0].strip()
136
 
137
-
138
- # ──────────────────────────────────────────────
139
  # Audio helpers
140
- # ──────────────────────────────────────────────
141
- def base64_to_wav(b64_data: str, output_path: str):
142
- """Decode base64 PCM data and write a proper WAV file."""
143
  audio_bytes = base64.b64decode(b64_data)
144
- sample_rate = 24000
145
- num_channels = 1
146
- bits_per_sample = 16
147
- byte_rate = sample_rate * num_channels * bits_per_sample // 8
148
- block_align = num_channels * bits_per_sample // 8
149
- data_size = len(audio_bytes)
150
  with open(output_path, "wb") as f:
151
  f.write(b"RIFF")
152
- f.write(struct.pack("<I", 36 + data_size))
153
  f.write(b"WAVE")
154
  f.write(b"fmt ")
155
  f.write(struct.pack("<I", 16))
156
  f.write(struct.pack("<H", 1))
157
- f.write(struct.pack("<H", num_channels))
158
- f.write(struct.pack("<I", sample_rate))
159
- f.write(struct.pack("<I", byte_rate))
160
- f.write(struct.pack("<H", block_align))
161
- f.write(struct.pack("<H", bits_per_sample))
162
  f.write(b"data")
163
- f.write(struct.pack("<I", data_size))
164
  f.write(audio_bytes)
165
 
166
 
167
- def concatenate_wavs(wav_files: list, output_path: str):
168
- """Concatenate multiple WAV files using ffmpeg."""
169
  if not wav_files:
170
  return
171
  if len(wav_files) == 1:
@@ -183,43 +171,90 @@ def concatenate_wavs(wav_files: list, output_path: str):
183
  os.remove(list_file)
184
 
185
 
186
- # ──────────────────────────────────────────────
187
  # Text splitting
188
- # ──────────────────────────────────────────────
189
- def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> list:
190
- """
191
- Split text into chunks at sentence boundaries.
192
- Tries to keep paragraphs together when possible.
193
- """
194
- # Normalize whitespace
195
  text = text.strip()
196
  if not text:
197
  return []
198
-
199
- # If short enough, return as-is
200
  if len(text) <= max_chars:
201
  return [text]
202
 
203
  chunks = []
204
- # First split by paragraphs
205
  paragraphs = re.split(r"\n\s*\n", text)
206
-
207
  current_chunk = ""
 
208
  for para in paragraphs:
209
  para = para.strip()
210
  if not para:
211
  continue
212
-
213
- # If adding this paragraph keeps us under the limit
214
  if len(current_chunk) + len(para) + 2 <= max_chars:
215
  current_chunk = (current_chunk + "\n\n" + para).strip()
216
  else:
217
- # Save current chunk if it has content
218
  if current_chunk:
219
  chunks.append(current_chunk)
220
  current_chunk = ""
221
-
222
- # If the paragraph itself is too long, split by sentences
223
  if len(para) > max_chars:
224
  sentences = re.split(r"(?<=[.!?])\s+", para)
225
  for sentence in sentences:
@@ -228,7 +263,6 @@ def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> l
228
  else:
229
  if current_chunk:
230
  chunks.append(current_chunk)
231
- # If a single sentence is too long, force-split it
232
  if len(sentence) > max_chars:
233
  words = sentence.split()
234
  current_chunk = ""
@@ -246,64 +280,148 @@ def split_text_into_chunks(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> l
246
 
247
  if current_chunk:
248
  chunks.append(current_chunk)
249
-
250
  return chunks
251
 
252
 
253
- # ──────────────────────────────────────────────
254
- # API: Generate speech for a text chunk
255
- # ──────────────────────────────────────────────
256
- def generate_speech_chunk(
257
- client: OpenAI,
258
- text: str,
259
- voice: str,
260
- language: str,
261
- lang_config: dict,
262
- translate: bool,
263
- chunk_index: int,
264
- output_dir: str,
265
- ) -> tuple:
266
- """
267
- Send a text chunk to Qwen3.5-Omni-Plus and get back audio.
268
- If translate=True, translates from English to target language and speaks.
269
- If translate=False, speaks the text directly in English.
270
- Returns (wav_path, transcript) or (None, error_msg).
271
- """
 
272
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
273
 
274
  if translate and language != "English":
275
  system_prompt = (
276
  f"You are a professional audiobook narrator and translator.\n"
277
- f"You will receive English text. Your task:\n"
278
- f"1. Translate the text into natural, fluent {language} ({lang_config['native']}).\n"
279
- f"2. Read the translated text aloud with clear, expressive narration.\n"
280
- f"3. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
281
- f" descriptions, and emotional moments.\n"
282
- f"4. Respond ONLY with the spoken {language} narration — no English,\n"
283
- f" no meta-commentary, no chapter headers unless they're in the text.\n"
284
- f"5. Maintain a natural reading pace suitable for an audiobook.\n"
285
- f"6. Translate idioms and cultural references appropriately."
286
- )
287
- user_text = (
288
- f"Translate the following English text into {language} and narrate it "
289
- f"as an audiobook. Respond only with the spoken {language} narration:\n\n{text}"
290
  )
 
291
  else:
292
  system_prompt = (
293
  "You are a professional audiobook narrator.\n"
294
- "You will receive text to read aloud. Your task:\n"
295
- "1. Read the text with clear, expressive narration.\n"
296
- "2. Use an engaging audiobook narration style — vary your tone for dialogue,\n"
297
- " descriptions, and emotional moments.\n"
298
- "3. Respond ONLY with the spoken narration — no meta-commentary.\n"
299
- "4. Maintain a natural reading pace suitable for an audiobook.\n"
300
- "5. Pause appropriately between paragraphs and at punctuation."
301
  )
302
- user_text = f"Narrate the following text as an audiobook:\n\n{text}"
303
 
304
  try:
305
  completion = client.chat.completions.create(
306
- model=MODEL,
307
  messages=[
308
  {"role": "system", "content": system_prompt},
309
  {"role": "user", "content": user_text},
@@ -336,103 +454,18 @@ def generate_speech_chunk(
336
  full_audio_b64 = "".join(audio_chunks)
337
  base64_to_wav(full_audio_b64, output_wav)
338
  return output_wav, transcript
339
- else:
340
- return None, "No audio received from API"
341
 
342
  except Exception as e:
343
  return None, str(e)
344
 
345
 
346
- # ──────────────────────────────────────────────
347
- # Generate silence between chapters/sections
348
- # ──────────────────────────────────────────────
349
- def generate_silence(duration_sec: float, output_path: str):
350
- """Generate a silent WAV file."""
351
- subprocess.run(
352
- ["ffmpeg", "-y", "-f", "lavfi",
353
- "-i", f"anullsrc=r=24000:cl=mono",
354
- "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
355
- capture_output=True, check=True,
356
- )
357
-
358
-
359
- # ──────────────────────────────────────────────
360
- # Document text extraction
361
- # ──────────────────────────────────────────────
362
- def extract_text_from_pdf(filepath: str) -> str:
363
- """Extract text from a PDF file using pypdf."""
364
- if not HAS_PYPDF:
365
- raise ImportError("pypdf is not installed. Cannot read PDF files.")
366
- reader = pypdf.PdfReader(filepath)
367
- pages = []
368
- for page in reader.pages:
369
- text = page.extract_text()
370
- if text:
371
- pages.append(text.strip())
372
- return "\n\n".join(pages)
373
-
374
-
375
- def extract_text_from_docx(filepath: str) -> str:
376
- """Extract text from a .docx file using python-docx."""
377
- if not HAS_DOCX:
378
- raise ImportError("python-docx is not installed. Cannot read Word files.")
379
- doc = docx.Document(filepath)
380
- paragraphs = []
381
- for para in doc.paragraphs:
382
- text = para.text.strip()
383
- if text:
384
- paragraphs.append(text)
385
- return "\n\n".join(paragraphs)
386
-
387
-
388
- def extract_text_from_file(filepath: str) -> str:
389
- """Extract text from a file based on its extension."""
390
- ext = os.path.splitext(filepath)[1].lower()
391
-
392
- if ext == ".pdf":
393
- return extract_text_from_pdf(filepath)
394
- elif ext in (".docx", ".doc"):
395
- if ext == ".doc":
396
- # .doc (old format) — try converting with LibreOffice if available
397
- try:
398
- tmp_dir = tempfile.mkdtemp()
399
- subprocess.run(
400
- ["libreoffice", "--headless", "--convert-to", "docx",
401
- "--outdir", tmp_dir, filepath],
402
- capture_output=True, check=True, timeout=60,
403
- )
404
- docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
405
- docx_path = os.path.join(tmp_dir, docx_name)
406
- if os.path.exists(docx_path):
407
- text = extract_text_from_docx(docx_path)
408
- shutil.rmtree(tmp_dir, ignore_errors=True)
409
- return text
410
- except Exception:
411
- pass
412
- raise gr.Error(
413
- "Cannot read .doc files directly. Please save as .docx or .pdf and re-upload."
414
- )
415
- return extract_text_from_docx(filepath)
416
- else:
417
- # Plain text files (.txt, .md, etc.)
418
- with open(filepath, "r", encoding="utf-8", errors="replace") as f:
419
- return f.read()
420
-
421
-
422
- # ──────────────────────────────────────────────
423
- # Main pipeline
424
- # ──────────────────────────────────────────────
425
- def generate_audiobook(
426
- text_input: str,
427
- file_input,
428
- target_language: str,
429
- voice_label: str,
430
- add_pauses: bool,
431
- progress=gr.Progress(),
432
- ):
433
- """Main audiobook generation pipeline."""
434
-
435
- # ── Resolve text source ──
436
  if file_input is not None:
437
  try:
438
  progress(0.02, desc="Extracting text from document...")
@@ -447,32 +480,45 @@ def generate_audiobook(
447
  raise gr.Error("Please provide text or upload a file.")
448
 
449
  if len(text) < 10:
450
- raise gr.Error("Text is too short. Please provide more content.")
451
 
452
- # ── API key ──
453
  api_key = os.environ.get("DASHSCOPE_API_KEY", "")
454
  if not api_key:
455
- raise gr.Error(
456
- "DASHSCOPE_API_KEY not set. Add it as a Space Secret "
457
- "(Settings → Secrets → New Secret)."
458
- )
459
 
460
- voice = get_voice_name(voice_label)
461
  lang_config = LANGUAGES[target_language]
 
462
  translate = target_language != "English"
463
  client = OpenAI(api_key=api_key, base_url=BASE_URL)
464
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
465
466
  try:
467
- # ── Split text ──
468
- progress(0.05, desc="Splitting text into chunks...")
469
  chunks = split_text_into_chunks(text)
470
  total_chunks = len(chunks)
471
  total_chars = sum(len(c) for c in chunks)
472
 
473
- progress(0.08, desc=f"Processing {total_chunks} chunks ({total_chars:,} characters)...")
474
-
475
- # ── Generate speech for each chunk ──
476
  audio_files = []
477
  all_transcripts = []
478
  silence_path = os.path.join(tmp_dir, "silence.wav")
@@ -480,39 +526,64 @@ def generate_audiobook(
480
  generate_silence(1.5, silence_path)
481
 
482
  for i, chunk in enumerate(chunks):
483
- frac = 0.1 + 0.8 * (i / total_chunks)
484
  progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
485
 
486
- wav_path, transcript = generate_speech_chunk(
487
- client, chunk, voice, target_language,
488
- lang_config, translate, i, tmp_dir,
489
- )
490
 
491
- if wav_path:
492
- audio_files.append(wav_path)
493
- # Add pause between chunks
494
- if add_pauses and i < total_chunks - 1:
495
- audio_files.append(silence_path)
 
 
496
  else:
497
- all_transcripts.append(f"⚠️ Chunk {i+1} failed: {transcript}")
498
- # Insert silence placeholder for failed chunk
499
- fail_silence = os.path.join(tmp_dir, f"fail_silence_{i:04d}.wav")
500
- generate_silence(2.0, fail_silence)
501
- audio_files.append(fail_silence)
502
 
503
- if transcript and not transcript.startswith("⚠️"):
504
- all_transcripts.append(transcript)
505
 
506
  if not audio_files:
507
- raise gr.Error("No audio was generated. Check your API key and try again.")
508
 
509
- # ── Concatenate all audio ──
510
- progress(0.92, desc="Assembling audiobook...")
511
  final_audio = os.path.join(tmp_dir, "audiobook.wav")
512
  concatenate_wavs(audio_files, final_audio)
513
 
514
- # ── Convert to MP3 for smaller file size ──
515
- progress(0.96, desc="Converting to MP3...")
516
  final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
517
  subprocess.run(
518
  ["ffmpeg", "-y", "-i", final_audio,
@@ -523,21 +594,20 @@ def generate_audiobook(
523
 
524
  progress(1.0, desc="Done!")
525
 
526
- # Build transcript display
527
- transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
528
-
529
- # Stats
530
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
 
531
  stats = (
532
  f"**Audiobook Generated!**\n\n"
533
  f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
534
  f"- **Language:** {target_language} ({lang_config['native']})\n"
535
- f"- **Voice:** {voice_label}\n"
 
536
  f"- **File size:** {audio_size:.1f} MB\n"
537
- f"- **Quality tier:** {lang_config['tier'].title()}\n"
538
  )
539
- if lang_config["tier"] == "extended":
540
- stats += "\n> ⚠️ This is an extended language. Voice quality may vary compared to the core 10 languages."
 
 
541
 
542
  return final_mp3, stats, transcript_text
543
 
@@ -545,33 +615,11 @@ def generate_audiobook(
545
  raise
546
  except Exception as e:
547
  raise gr.Error(f"Pipeline error: {str(e)}")
548
- finally:
549
- # Don't clean up tmp_dir yet — Gradio needs the files
550
- pass
551
-
552
 
553
- # ──────────────────────────────────────────────
554
- # Build language choices with tier labels
555
- # ──────────────────────────────────────────────
556
- def get_language_choices():
557
- core = [f"⭐ {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "core"]
558
- extended = [f" {name}" for name, cfg in LANGUAGES.items() if cfg["tier"] == "extended"]
559
- return core + extended
560
 
561
-
562
- def clean_language_name(choice: str) -> str:
563
- """Remove the tier prefix from the dropdown choice."""
564
- return choice.replace("⭐ ", "").replace(" ", "").strip()
565
-
566
-
567
- def generate_wrapper(text_input, file_input, language_choice, voice, add_pauses, progress=gr.Progress()):
568
- language = clean_language_name(language_choice)
569
- return generate_audiobook(text_input, file_input, language, voice, add_pauses, progress)
570
-
571
-
572
- # ──────────────────────────────────────────────
573
- # Sample text
574
- # ──────────────────────────────────────────────
575
  SAMPLE_TEXT = """Chapter 1: The Beginning
576
 
577
  The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
@@ -580,41 +628,59 @@ The old lighthouse stood at the edge of the world, or so it seemed to the girl w
580
 
581
  The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
582
 
583
- Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather — grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.
584
 
585
  The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
586
 
587
  "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
588
 
589
- And he would smile — that slow, careful smile that seemed to cost him something each time — and begin."""
590
 
591
-
592
- # ──────────────────────────────────────────────
593
- # Gradio UI
594
- # ──────────────────────────────────────────────
595
  DESCRIPTION = """
596
- # 📖 Audiobook Generator
597
- ### English Text → Multi-Language Audiobook
 
598
 
599
- Paste or upload English text and get a professionally narrated audiobook in any of **36 languages**.
600
- The AI translates and narrates with expressive, audiobook-quality speech.
601
 
602
- ⭐ = Core language (best quality) · Others = Extended support
 
 
 
603
  """
604
 
605
- # Language dropdown choices
606
  lang_choices = []
607
- lang_choices.append("── Core Languages (Best Quality) ──")
608
  for name, cfg in LANGUAGES.items():
609
  if cfg["tier"] == "core":
610
- lang_choices.append(f"⭐ {name}")
611
- lang_choices.append("── Extended Languages ──")
612
  for name, cfg in LANGUAGES.items():
613
  if cfg["tier"] == "extended":
614
  lang_choices.append(name)
615
616
  with gr.Blocks(
617
- title="Audiobook Generator — Qwen3.5-Omni",
618
  theme=gr.themes.Soft(
619
  primary_hue="indigo",
620
  secondary_hue="purple",
@@ -625,81 +691,98 @@ with gr.Blocks(
625
  gr.Markdown(DESCRIPTION)
626
 
627
  with gr.Row():
628
- # ── Left column: Input ──
629
  with gr.Column(scale=1):
630
  text_input = gr.Textbox(
631
  label="English Text",
632
  placeholder="Paste your English text here...",
633
- lines=12,
634
- max_lines=30,
635
  )
636
-
637
  file_input = gr.File(
638
  label="Or Upload a Document (.txt, .md, .pdf, .docx)",
639
  file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
640
  type="filepath",
641
  )
 
642
 
643
- sample_btn = gr.Button("📄 Load Sample Text", variant="secondary", size="sm")
644
 
645
- with gr.Row():
646
- target_lang = gr.Dropdown(
647
- choices=[c for c in lang_choices if not c.startswith("──")],
648
- value="⭐ English",
649
- label="Target Language",
650
- info="⭐ = Core (best quality). Choose English for no translation.",
651
- )
652
 
653
- voice_select = gr.Dropdown(
654
- choices=ALL_VOICES,
655
- value="Jennifer — Cinematic narrator",
656
- label="Narrator Voice",
657
- )
658
 
659
  add_pauses = gr.Checkbox(
660
  value=True,
661
  label="Add pauses between sections",
662
- info="Adds 1.5s silence between text chunks for natural pacing",
663
  )
664
 
665
- generate_btn = gr.Button(
666
- "🎙️ Generate Audiobook",
667
- variant="primary",
668
- size="lg",
669
- )
670
 
671
- # ── Right column: Output ──
672
  with gr.Column(scale=1):
673
- audio_output = gr.Audio(
674
- label="Generated Audiobook",
675
- type="filepath",
676
- )
677
-
678
  stats_output = gr.Markdown(label="Generation Stats")
679
-
680
  with gr.Accordion("Translation / Narration Transcript", open=False):
681
  transcript_output = gr.Markdown()
682
 
683
- # ── Event handlers ──
684
- sample_btn.click(
685
- fn=lambda: SAMPLE_TEXT,
686
- outputs=text_input,
 
 
687
  )
688
 
689
  generate_btn.click(
690
  fn=generate_wrapper,
691
- inputs=[text_input, file_input, target_lang, voice_select, add_pauses],
 
692
  outputs=[audio_output, stats_output, transcript_output],
693
  )
694
 
695
- # ── Footer ──
696
  gr.Markdown(
697
  "---\n"
698
- "**Supported languages (36):** Arabic, Bengali, Cantonese, Chinese, Czech, Danish, Dutch, "
699
- "English, Filipino, Finnish, French, German, Greek, Hebrew, Hindi, Hungarian, Indonesian, "
700
- "Italian, Japanese, Korean, Malay, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, "
701
- "Spanish, Swahili, Swedish, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese\n\n"
702
- )
703
 
704
  if __name__ == "__main__":
705
  demo.launch()
 
1
  """
2
+ Audiobook Generator - English Source to Multi-Language Audio
3
+ Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC via DashScope API
4
+
5
+ Three voice modes:
6
+ 1. Preset Voices: Use built-in Qwen voices (via Qwen3.5-Omni-Plus)
7
+ 2. Cloned Voice: Clone a voice from audio sample (via Qwen3-TTS-VC)
8
+ 3. Both support translation from English to 36 languages
9
+
10
+ Deploy as a Hugging Face Space:
11
+ 1. Create a new Space (SDK: Gradio)
12
+ 2. Upload app.py and requirements.txt
13
+ 3. Add DASHSCOPE_API_KEY as a Space Secret
14
  """
15
 
16
  import os
17
  import base64
18
+ import json
19
  import math
20
+ import pathlib
21
  import shutil
22
  import struct
23
  import subprocess
 
26
  import re
27
 
28
  import gradio as gr
29
+ import requests as http_requests
30
  from openai import OpenAI
31
 
32
+ # Optional document parsers
33
  try:
34
  import pypdf
35
  HAS_PYPDF = True
 
42
  except ImportError:
43
  HAS_DOCX = False
44
 
 
45
  # Configuration
46
+ OMNI_MODEL = "qwen3.5-omni-plus"
47
+ TTS_VC_MODEL = "qwen3-tts-vc-2026-01-22"
48
+ VOICE_CLONE_MODEL = "qwen-voice-enrollment"
49
+
50
  BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
51
+ DASHSCOPE_API_URL = "https://dashscope-intl.aliyuncs.com/api/v1"
52
+ VOICE_CLONE_URL = f"{DASHSCOPE_API_URL}/services/audio/tts/customization"
53
+ TTS_SYNTHESIS_URL = f"{DASHSCOPE_API_URL}/services/aigc/multimodal-generation/generation"
54
 
 
 
55
  MAX_CHARS_PER_CHUNK = 1500
56
 
57
+ # Languages
 
 
58
  LANGUAGES = {
 
59
  "English": {"code": "en", "native": "English", "tier": "core"},
60
+ "Chinese (Mandarin)": {"code": "zh", "native": "Chinese", "tier": "core"},
61
+ "Japanese": {"code": "ja", "native": "Japanese", "tier": "core"},
62
+ "Korean": {"code": "ko", "native": "Korean", "tier": "core"},
63
  "German": {"code": "de", "native": "Deutsch", "tier": "core"},
64
+ "French": {"code": "fr", "native": "Francais", "tier": "core"},
65
+ "Russian": {"code": "ru", "native": "Russian", "tier": "core"},
66
+ "Portuguese": {"code": "pt", "native": "Portugues", "tier": "core"},
67
+ "Spanish": {"code": "es", "native": "Espanol", "tier": "core"},
68
  "Italian": {"code": "it", "native": "Italiano", "tier": "core"},
69
+ "Arabic": {"code": "ar", "native": "Arabic", "tier": "extended"},
 
70
  "Dutch": {"code": "nl", "native": "Nederlands", "tier": "extended"},
71
  "Polish": {"code": "pl", "native": "Polski", "tier": "extended"},
72
+ "Turkish": {"code": "tr", "native": "Turkce", "tier": "extended"},
73
+ "Vietnamese": {"code": "vi", "native": "Tieng Viet", "tier": "extended"},
74
+ "Thai": {"code": "th", "native": "Thai", "tier": "extended"},
75
  "Indonesian": {"code": "id", "native": "Bahasa Indonesia", "tier": "extended"},
76
  "Malay": {"code": "ms", "native": "Bahasa Melayu", "tier": "extended"},
77
+ "Hindi": {"code": "hi", "native": "Hindi", "tier": "extended"},
78
+ "Bengali": {"code": "bn", "native": "Bengali", "tier": "extended"},
79
+ "Urdu": {"code": "ur", "native": "Urdu", "tier": "extended"},
80
  "Swedish": {"code": "sv", "native": "Svenska", "tier": "extended"},
81
+ "Czech": {"code": "cs", "native": "Cestina", "tier": "extended"},
82
+ "Romanian": {"code": "ro", "native": "Romana", "tier": "extended"},
83
+ "Greek": {"code": "el", "native": "Greek", "tier": "extended"},
84
  "Hungarian": {"code": "hu", "native": "Magyar", "tier": "extended"},
85
  "Finnish": {"code": "fi", "native": "Suomi", "tier": "extended"},
86
  "Danish": {"code": "da", "native": "Dansk", "tier": "extended"},
87
  "Norwegian": {"code": "no", "native": "Norsk", "tier": "extended"},
88
+ "Ukrainian": {"code": "uk", "native": "Ukrainian", "tier": "extended"},
89
+ "Hebrew": {"code": "he", "native": "Hebrew", "tier": "extended"},
90
+ "Persian": {"code": "fa", "native": "Farsi", "tier": "extended"},
91
+ "Cantonese": {"code": "yue", "native": "Cantonese", "tier": "extended"},
92
  "Filipino": {"code": "fil", "native": "Filipino", "tier": "extended"},
93
  "Swahili": {"code": "sw", "native": "Kiswahili", "tier": "extended"},
94
+ "Tamil": {"code": "ta", "native": "Tamil", "tier": "extended"},
95
  }
96
 
97
+ VOICE_CLONE_LANGUAGES = {
98
+ "English", "Chinese (Mandarin)", "Japanese", "Korean", "German",
99
+ "French", "Russian", "Portuguese", "Spanish", "Italian",
100
  }
101
 
102
+ PRESET_VOICES = [
103
+ "Cherry -- Sunny, friendly",
104
+ "Serena -- Gentle, soft",
105
+ "Jennifer -- Cinematic narrator",
106
+ "Katerina -- Mature, rich rhythm",
107
+ "Ethan -- Warm, energetic",
108
+ "Ryan -- Dramatic, rhythmic",
109
+ "Kai -- Soothing, calm",
110
+ "Neil -- Precise, clear",
111
+ "Lenn -- Rational, steady",
112
+ "Aiden -- Young, lively",
113
+ "Eldric Sage -- Authoritative narrator",
114
+ "Arthur -- Classic, mature",
115
+ "Mia -- Young, versatile",
116
+ "Bella -- Elegant, warm",
117
+ "Vivian -- Professional, clear",
118
+ "Seren -- Calm, measured",
119
+ "Dolce -- Sweet, melodic",
120
+ "Bellona -- Strong, commanding",
121
+ "Vincent -- Rich, theatrical",
122
+ "Andre -- Deep, resonant",
123
+ ]
124
+
125
+
126
+ def get_voice_name(label):
127
+ return label.split("--")[0].strip()
128
 
 
 
 
129
 
 
 
130
  # Audio helpers
131
+ def base64_to_wav(b64_data, output_path):
 
 
132
  audio_bytes = base64.b64decode(b64_data)
133
+ sr = 24000
134
+ nc = 1
135
+ bps = 16
136
+ br = sr * nc * bps // 8
137
+ ba = nc * bps // 8
138
+ ds = len(audio_bytes)
139
  with open(output_path, "wb") as f:
140
  f.write(b"RIFF")
141
+ f.write(struct.pack("<I", 36 + ds))
142
  f.write(b"WAVE")
143
  f.write(b"fmt ")
144
  f.write(struct.pack("<I", 16))
145
  f.write(struct.pack("<H", 1))
146
+ f.write(struct.pack("<H", nc))
147
+ f.write(struct.pack("<I", sr))
148
+ f.write(struct.pack("<I", br))
149
+ f.write(struct.pack("<H", ba))
150
+ f.write(struct.pack("<H", bps))
151
  f.write(b"data")
152
+ f.write(struct.pack("<I", ds))
153
  f.write(audio_bytes)
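base64_to_wav writes a fixed 44-byte RIFF/WAVE header for 24 kHz, mono, 16-bit PCM. A quick sanity check with the standard-library wave module (the file name is a placeholder for any chunk produced by the function):
import wave

# Illustration only: verify the header written by base64_to_wav parses correctly.
with wave.open("chunk_0000.wav", "rb") as w:
    assert w.getframerate() == 24000  # sample rate from the fmt chunk
    assert w.getnchannels() == 1      # mono
    assert w.getsampwidth() == 2      # 16-bit PCM = 2 bytes per sample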
154
 
155
 
156
+ def concatenate_wavs(wav_files, output_path):
 
157
  if not wav_files:
158
  return
159
  if len(wav_files) == 1:
 
171
  os.remove(list_file)
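The middle of concatenate_wavs is unchanged and elided by the diff; judging from the list_file cleanup above, it follows the usual ffmpeg concat-demuxer pattern. A hedged sketch of that pattern (names illustrative, not the author's exact code):
# Illustrative sketch of the concat-demuxer approach (not the diff's exact body).
def concat_wavs_sketch(wav_files, output_path, list_file="concat_list.txt"):
    with open(list_file, "w") as f:
        for path in wav_files:
            f.write(f"file '{path}'\n")
    subprocess.run(
        ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
         "-i", list_file, "-c", "copy", output_path],
        capture_output=True, check=True,
    )
    os.remove(list_file)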
172
 
173
 
174
+ def generate_silence(duration_sec, output_path):
175
+ subprocess.run(
176
+ ["ffmpeg", "-y", "-f", "lavfi",
177
+ "-i", "anullsrc=r=24000:cl=mono",
178
+ "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
179
+ capture_output=True, check=True,
180
+ )
181
+
182
+
183
+ # Document extraction
184
+ def extract_text_from_pdf(filepath):
185
+ if not HAS_PYPDF:
186
+ raise ImportError("pypdf is not installed.")
187
+ reader = pypdf.PdfReader(filepath)
188
+ pages = []
189
+ for page in reader.pages:
190
+ text = page.extract_text()
191
+ if text:
192
+ pages.append(text.strip())
193
+ return "\n\n".join(pages)
194
+
195
+
196
+ def extract_text_from_docx(filepath):
197
+ if not HAS_DOCX:
198
+ raise ImportError("python-docx is not installed.")
199
+ doc = docx.Document(filepath)
200
+ paragraphs = []
201
+ for para in doc.paragraphs:
202
+ text = para.text.strip()
203
+ if text:
204
+ paragraphs.append(text)
205
+ return "\n\n".join(paragraphs)
206
+
207
+
208
+ def extract_text_from_file(filepath):
209
+ ext = os.path.splitext(filepath)[1].lower()
210
+ if ext == ".pdf":
211
+ return extract_text_from_pdf(filepath)
212
+ elif ext in (".docx", ".doc"):
213
+ if ext == ".doc":
214
+ try:
215
+ tmp_dir = tempfile.mkdtemp()
216
+ subprocess.run(
217
+ ["libreoffice", "--headless", "--convert-to", "docx",
218
+ "--outdir", tmp_dir, filepath],
219
+ capture_output=True, check=True, timeout=60,
220
+ )
221
+ docx_name = os.path.splitext(os.path.basename(filepath))[0] + ".docx"
222
+ docx_path = os.path.join(tmp_dir, docx_name)
223
+ if os.path.exists(docx_path):
224
+ text = extract_text_from_docx(docx_path)
225
+ shutil.rmtree(tmp_dir, ignore_errors=True)
226
+ return text
227
+ except Exception:
228
+ pass
229
+ raise gr.Error("Cannot read .doc files. Please save as .docx or .pdf.")
230
+ return extract_text_from_docx(filepath)
231
+ else:
232
+ with open(filepath, "r", encoding="utf-8", errors="replace") as f:
233
+ return f.read()
234
+
235
+
236
  # Text splitting
237
+ def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK):
238
  text = text.strip()
239
  if not text:
240
  return []
 
 
241
  if len(text) <= max_chars:
242
  return [text]
243
 
244
  chunks = []
 
245
  paragraphs = re.split(r"\n\s*\n", text)
 
246
  current_chunk = ""
247
+
248
  for para in paragraphs:
249
  para = para.strip()
250
  if not para:
251
  continue
 
 
252
  if len(current_chunk) + len(para) + 2 <= max_chars:
253
  current_chunk = (current_chunk + "\n\n" + para).strip()
254
  else:
 
255
  if current_chunk:
256
  chunks.append(current_chunk)
257
  current_chunk = ""
 
 
258
  if len(para) > max_chars:
259
  sentences = re.split(r"(?<=[.!?])\s+", para)
260
  for sentence in sentences:
 
263
  else:
264
  if current_chunk:
265
  chunks.append(current_chunk)
 
266
  if len(sentence) > max_chars:
267
  words = sentence.split()
268
  current_chunk = ""
 
280
 
281
  if current_chunk:
282
  chunks.append(current_chunk)
 
283
  return chunks
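An illustrative use of the splitter above: paragraphs are kept whole when they fit, and no chunk in this example exceeds MAX_CHARS_PER_CHUNK.
# Example (illustration only): five ~1,000-character paragraphs, each under the limit.
sample = ("A paragraph of narration. " * 40 + "\n\n") * 5
chunks = split_text_into_chunks(sample)
assert all(len(c) <= MAX_CHARS_PER_CHUNK for c in chunks)
print(f"{len(chunks)} chunks, longest {max(len(c) for c in chunks)} chars")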
284
 
285
 
286
+ # ==============================
287
+ # VOICE CLONING
288
+ # ==============================
289
+ def clone_voice(audio_path, api_key, preferred_name="audiobook_voice"):
290
+ filepath = pathlib.Path(audio_path)
291
+ if not filepath.exists():
292
+ raise FileNotFoundError(f"Audio file not found: {audio_path}")
293
+
294
+ ext = filepath.suffix.lower()
295
+ mime_map = {".wav": "audio/wav", ".mp3": "audio/mpeg", ".m4a": "audio/mp4"}
296
+ mime_type = mime_map.get(ext, "audio/mpeg")
297
+
298
+ b64_str = base64.b64encode(filepath.read_bytes()).decode()
299
+ data_uri = f"data:{mime_type};base64,{b64_str}"
300
+
301
+ payload = {
302
+ "model": VOICE_CLONE_MODEL,
303
+ "input": {
304
+ "action": "create",
305
+ "target_model": TTS_VC_MODEL,
306
+ "preferred_name": preferred_name,
307
+ "audio": {"data": data_uri},
308
+ },
309
+ }
310
+ headers = {
311
+ "Authorization": f"Bearer {api_key}",
312
+ "Content-Type": "application/json",
313
+ }
314
+
315
+ resp = http_requests.post(VOICE_CLONE_URL, json=payload, headers=headers, timeout=60)
316
+ if resp.status_code != 200:
317
+ raise RuntimeError(f"Voice cloning failed ({resp.status_code}): {resp.text}")
318
+
319
+ try:
320
+ return resp.json()["output"]["voice"]
321
+ except (KeyError, ValueError) as e:
322
+ raise RuntimeError(f"Failed to parse voice clone response: {e}\n{resp.text}")
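A hedged usage sketch for the enrollment helper above; the sample path is a placeholder, and the returned voice ID is whatever DashScope provides.
# Hypothetical call (illustration only): enroll one clean ~30 s sample, reuse the ID.
api_key = os.environ["DASHSCOPE_API_KEY"]
voice_id = clone_voice("narrator_sample.wav", api_key, preferred_name="my_narrator")
print("Enrolled voice:", voice_id)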
323
+
324
+
325
+ # ==============================
326
+ # TTS WITH CLONED VOICE
327
+ # ==============================
328
+ def synthesize_with_cloned_voice(text, voice_id, language, api_key, output_dir, chunk_index):
329
+ lang_type_map = {
330
+ "English": "English", "Chinese (Mandarin)": "Chinese",
331
+ "Japanese": "Japanese", "Korean": "Korean",
332
+ "German": "German", "French": "French",
333
+ "Russian": "Russian", "Portuguese": "Portuguese",
334
+ "Spanish": "Spanish", "Italian": "Italian",
335
+ }
336
+ language_type = lang_type_map.get(language, "English")
337
+
338
+ payload = {
339
+ "model": TTS_VC_MODEL,
340
+ "input": {
341
+ "text": text,
342
+ "voice": voice_id,
343
+ "language_type": language_type,
344
+ },
345
+ }
346
+ headers = {
347
+ "Authorization": f"Bearer {api_key}",
348
+ "Content-Type": "application/json",
349
+ }
350
+
351
+ try:
352
+ resp = http_requests.post(TTS_SYNTHESIS_URL, json=payload, headers=headers, timeout=120)
353
+ if resp.status_code != 200:
354
+ return None, f"TTS failed ({resp.status_code}): {resp.text[:200]}"
355
+
356
+ result = resp.json()
357
+ audio_url = result.get("output", {}).get("audio", {}).get("url")
358
+ if not audio_url:
359
+ return None, f"No audio URL in response: {json.dumps(result)[:200]}"
360
+
361
+ output_wav = os.path.join(output_dir, f"vc_chunk_{chunk_index:04d}.wav")
362
+ audio_resp = http_requests.get(audio_url, timeout=120)
363
+ if audio_resp.status_code != 200:
364
+ return None, "Failed to download audio from URL"
365
+
366
+ with open(output_wav, "wb") as f:
367
+ f.write(audio_resp.content)
368
+
369
+ return output_wav, None
370
+
371
+ except Exception as e:
372
+ return None, str(e)
373
+
374
+
375
+ # ==============================
376
+ # TRANSLATION (text only)
377
+ # ==============================
378
+ def translate_text(client, text, target_language, lang_config):
379
+ response = client.chat.completions.create(
380
+ model=OMNI_MODEL,
381
+ modalities=["text"],
382
+ messages=[
383
+ {
384
+ "role": "system",
385
+ "content": (
386
+ f"You are a professional translator. Translate English text into "
387
+ f"natural, fluent {target_language} ({lang_config['native']}). "
388
+ f"Output ONLY the translated text."
389
+ ),
390
+ },
391
+ {
392
+ "role": "user",
393
+ "content": f"Translate the following into {target_language}:\n\n{text}",
394
+ },
395
+ ],
396
+ )
397
+ return response.choices[0].message.content.strip()
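Putting the helpers together, a sketch of the cloned-voice path for a single chunk, mirroring what generate_audiobook does below (paths and text are placeholders):
# Cloned-voice pipeline for one chunk (illustration only).
api_key = os.environ["DASHSCOPE_API_KEY"]
client = OpenAI(api_key=api_key, base_url=BASE_URL)
out_dir = tempfile.mkdtemp(prefix="audiobook_")

voice_id = clone_voice("narrator_sample.wav", api_key)                     # 1. enroll the voice
chunk = "The old lighthouse stood at the edge of the world."
spanish = translate_text(client, chunk, "Spanish", LANGUAGES["Spanish"])   # 2. text-only translation
wav_path, err = synthesize_with_cloned_voice(                              # 3. speak with the cloned voice
    spanish, voice_id, "Spanish", api_key, out_dir, 0)
print(wav_path or f"TTS error: {err}")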
398
+
399
+
400
+ # ==============================
401
+ # SPEECH WITH PRESET VOICE
402
+ # ==============================
403
+ def generate_speech_preset(client, text, voice, language, lang_config, translate, chunk_index, output_dir):
404
  output_wav = os.path.join(output_dir, f"chunk_{chunk_index:04d}.wav")
405
 
406
  if translate and language != "English":
407
  system_prompt = (
408
  f"You are a professional audiobook narrator and translator.\n"
409
+ f"Translate the English text into natural {language} ({lang_config['native']}).\n"
410
+ f"Read the translation aloud with expressive audiobook narration.\n"
411
+ f"Respond ONLY with the spoken {language} narration."
 
 
 
 
 
 
 
 
 
 
412
  )
413
+ user_text = f"Translate into {language} and narrate as an audiobook:\n\n{text}"
414
  else:
415
  system_prompt = (
416
  "You are a professional audiobook narrator.\n"
417
+ "Read the text with clear, expressive narration.\n"
418
+ "Respond ONLY with the spoken narration."
 
 
 
 
 
419
  )
420
+ user_text = f"Narrate as an audiobook:\n\n{text}"
421
 
422
  try:
423
  completion = client.chat.completions.create(
424
+ model=OMNI_MODEL,
425
  messages=[
426
  {"role": "system", "content": system_prompt},
427
  {"role": "user", "content": user_text},
 
454
  full_audio_b64 = "".join(audio_chunks)
455
  base64_to_wav(full_audio_b64, output_wav)
456
  return output_wav, transcript
457
+ return None, "No audio received"
 
458
 
459
  except Exception as e:
460
  return None, str(e)
461
 
462
 
463
+ # ==============================
464
+ # MAIN PIPELINE
465
+ # ==============================
466
+ def generate_audiobook(text_input, file_input, target_language, voice_mode,
467
+ preset_voice_label, clone_audio, add_pauses, progress=gr.Progress()):
468
+ # Resolve text
469
  if file_input is not None:
470
  try:
471
  progress(0.02, desc="Extracting text from document...")
 
480
  raise gr.Error("Please provide text or upload a file.")
481
 
482
  if len(text) < 10:
483
+ raise gr.Error("Text is too short.")
484
 
 
485
  api_key = os.environ.get("DASHSCOPE_API_KEY", "")
486
  if not api_key:
487
+ raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
 
 
 
488
 
 
489
  lang_config = LANGUAGES[target_language]
490
+ use_clone = voice_mode == "Clone a Voice"
491
  translate = target_language != "English"
492
  client = OpenAI(api_key=api_key, base_url=BASE_URL)
493
  tmp_dir = tempfile.mkdtemp(prefix="audiobook_")
494
 
495
+ # Voice cloning setup
496
+ cloned_voice_id = None
497
+ if use_clone:
498
+ if clone_audio is None:
499
+ raise gr.Error("Please upload a voice sample (10-60 seconds of clear speech).")
500
+
501
+ if target_language not in VOICE_CLONE_LANGUAGES:
502
+ raise gr.Error(
503
+ f"Voice cloning TTS supports: {', '.join(sorted(VOICE_CLONE_LANGUAGES))}. "
504
+ f"'{target_language}' is not supported with cloned voices. Use a preset voice instead."
505
+ )
506
+
507
+ progress(0.03, desc="Cloning voice from audio sample...")
508
+ try:
509
+ cloned_voice_id = clone_voice(clone_audio, api_key)
510
+ progress(0.08, desc="Voice cloned successfully!")
511
+ except Exception as e:
512
+ raise gr.Error(f"Voice cloning failed: {e}")
513
+
514
  try:
515
+ # Split text
516
+ progress(0.10, desc="Splitting text into chunks...")
517
  chunks = split_text_into_chunks(text)
518
  total_chunks = len(chunks)
519
  total_chars = sum(len(c) for c in chunks)
520
 
521
+ # Process each chunk
 
 
522
  audio_files = []
523
  all_transcripts = []
524
  silence_path = os.path.join(tmp_dir, "silence.wav")
 
526
  generate_silence(1.5, silence_path)
527
 
528
  for i, chunk in enumerate(chunks):
529
+ frac = 0.12 + 0.75 * (i / total_chunks)
530
  progress(frac, desc=f"Narrating chunk {i+1}/{total_chunks}...")
531
 
532
+ if use_clone:
533
+ # CLONED VOICE PIPELINE
534
+ final_text = chunk
535
+ if translate:
536
+ try:
537
+ final_text = translate_text(client, chunk, target_language, lang_config)
538
+ all_transcripts.append(final_text)
539
+ except Exception as e:
540
+ all_transcripts.append(f"Translation failed for chunk {i+1}: {e}")
541
+ final_text = chunk
542
+
543
+ wav_path, error = synthesize_with_cloned_voice(
544
+ final_text, cloned_voice_id, target_language, api_key, tmp_dir, i,
545
+ )
546
 
547
+ if wav_path:
548
+ audio_files.append(wav_path)
549
+ else:
550
+ all_transcripts.append(f"TTS failed for chunk {i+1}: {error}")
551
+ fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
552
+ generate_silence(2.0, fail_silence)
553
+ audio_files.append(fail_silence)
554
  else:
555
+ # PRESET VOICE PIPELINE
556
+ voice = get_voice_name(preset_voice_label)
557
+ wav_path, transcript = generate_speech_preset(
558
+ client, chunk, voice, target_language,
559
+ lang_config, translate, i, tmp_dir,
560
+ )
561
+
562
+ if wav_path:
563
+ audio_files.append(wav_path)
564
+ else:
565
+ all_transcripts.append(f"Chunk {i+1} failed: {transcript}")
566
+ fail_silence = os.path.join(tmp_dir, f"fail_{i:04d}.wav")
567
+ generate_silence(2.0, fail_silence)
568
+ audio_files.append(fail_silence)
569
 
570
+ if transcript and "failed" not in transcript.lower():
571
+ all_transcripts.append(transcript)
572
+
573
+ # Pause between chunks
574
+ if add_pauses and i < total_chunks - 1 and audio_files:
575
+ audio_files.append(silence_path)
576
 
577
  if not audio_files:
578
+ raise gr.Error("No audio was generated.")
579
 
580
+ # Concatenate
581
+ progress(0.90, desc="Assembling audiobook...")
582
  final_audio = os.path.join(tmp_dir, "audiobook.wav")
583
  concatenate_wavs(audio_files, final_audio)
584
 
585
+ # Convert to MP3
586
+ progress(0.95, desc="Converting to MP3...")
587
  final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
588
  subprocess.run(
589
  ["ffmpeg", "-y", "-i", final_audio,
 
594
 
595
  progress(1.0, desc="Done!")
596
 
 
 
 
 
597
  audio_size = os.path.getsize(final_mp3) / (1024 * 1024)
598
+ voice_info = f"Cloned voice (ID: {cloned_voice_id[:20]}...)" if use_clone else preset_voice_label
599
  stats = (
600
  f"**Audiobook Generated!**\n\n"
601
  f"- **Source:** {total_chars:,} characters in {total_chunks} chunks\n"
602
  f"- **Language:** {target_language} ({lang_config['native']})\n"
603
+ f"- **Voice:** {voice_info}\n"
604
+ f"- **Mode:** {'Voice Clone via Qwen3-TTS-VC' if use_clone else 'Preset via Qwen3.5-Omni-Plus'}\n"
605
  f"- **File size:** {audio_size:.1f} MB\n"
 
606
  )
607
+ if lang_config["tier"] == "extended" and not use_clone:
608
+ stats += "\n> Note: Extended language - voice quality may vary."
609
+
610
+ transcript_text = "\n\n---\n\n".join(all_transcripts) if all_transcripts else ""
611
 
612
  return final_mp3, stats, transcript_text
613
 
 
615
  raise
616
  except Exception as e:
617
  raise gr.Error(f"Pipeline error: {str(e)}")
618
619
 
620
+ # ==============================
621
+ # GRADIO UI
622
+ # ==============================
623
  SAMPLE_TEXT = """Chapter 1: The Beginning
624
 
625
  The old lighthouse stood at the edge of the world, or so it seemed to the girl who had lived in its shadow all her life. Each morning, she would climb the winding iron staircase to the lamp room, counting exactly one hundred and forty-seven steps, and watch the sun rise from the sea like a great golden coin tossed by the gods.
 
628
 
629
  The gulls, as always, said nothing. They merely tilted their heads and regarded her with ancient, knowing eyes before launching themselves into the wind.
630
 
631
+ Her name was Elena, and she was seventeen years old. She had hair the color of dark honey and eyes that changed with the weather - grey in storms, green in sunlight, and something altogether different in the strange purple twilight that sometimes settled over the coast in autumn.
632
 
633
  The lighthouse keeper, her grandfather, was a man of few words but many stories. He kept them locked away like treasures in a chest, only bringing them out on winter nights when the storms howled outside and the old building trembled like a living thing.
634
 
635
  "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
636
 
637
+ And he would smile - that slow, careful smile that seemed to cost him something each time - and begin."""
638
639
  DESCRIPTION = """
640
+ # Audiobook Generator
641
+ ### English Text to Multi-Language Audiobook with Voice Cloning
642
+ **Powered by Qwen3.5-Omni-Plus + Qwen3-TTS-VC**
643
 
644
+ Upload English text and generate a narrated audiobook in **36 languages**.
645
+ Choose a **preset voice** or **clone any voice** from a short audio sample!
646
 
647
+ | Mode | Model | Languages | How it works |
648
+ |------|-------|-----------|-------------|
649
+ | **Preset Voice** | Qwen3.5-Omni-Plus | 36 languages | Translates + speaks in one step |
650
+ | **Clone a Voice** | Qwen3-TTS-VC | 10 core languages | Clones voice, translates, then speaks |
651
  """
652
 
 
653
  lang_choices = []
 
654
  for name, cfg in LANGUAGES.items():
655
  if cfg["tier"] == "core":
656
+ lang_choices.append(f"* {name}")
 
657
  for name, cfg in LANGUAGES.items():
658
  if cfg["tier"] == "extended":
659
  lang_choices.append(name)
660
 
661
+
662
+ def clean_language_name(choice):
663
+ return choice.replace("* ", "").strip()
664
+
665
+
666
+ def on_voice_mode_change(mode):
667
+ if mode == "Clone a Voice":
668
+ return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
669
+ else:
670
+ return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
671
+
672
+
673
+ def generate_wrapper(text_input, file_input, language_choice, voice_mode,
674
+ preset_voice, clone_audio, add_pauses, progress=gr.Progress()):
675
+ language = clean_language_name(language_choice)
676
+ return generate_audiobook(
677
+ text_input, file_input, language, voice_mode,
678
+ preset_voice, clone_audio, add_pauses, progress,
679
+ )
680
+
681
+
682
  with gr.Blocks(
683
+ title="Audiobook Generator",
684
  theme=gr.themes.Soft(
685
  primary_hue="indigo",
686
  secondary_hue="purple",
 
691
  gr.Markdown(DESCRIPTION)
692
 
693
  with gr.Row():
 
694
  with gr.Column(scale=1):
695
  text_input = gr.Textbox(
696
  label="English Text",
697
  placeholder="Paste your English text here...",
698
+ lines=10,
699
+ max_lines=25,
700
  )
 
701
  file_input = gr.File(
702
  label="Or Upload a Document (.txt, .md, .pdf, .docx)",
703
  file_types=[".txt", ".md", ".text", ".pdf", ".docx", ".doc"],
704
  type="filepath",
705
  )
706
+ sample_btn = gr.Button("Load Sample Text", variant="secondary", size="sm")
707
 
708
+ target_lang = gr.Dropdown(
709
+ choices=lang_choices,
710
+ value="* English",
711
+ label="Target Language",
712
+ info="* = Core (best quality). Voice cloning supports core languages only.",
713
+ )
714
 
715
+ voice_mode = gr.Radio(
716
+ choices=["Preset Voice", "Clone a Voice"],
717
+ value="Preset Voice",
718
+ label="Voice Mode",
719
+ )
 
 
720
 
721
+ preset_voice = gr.Dropdown(
722
+ choices=PRESET_VOICES,
723
+ value="Jennifer -- Cinematic narrator",
724
+ label="Preset Narrator Voice",
725
+ visible=True,
726
+ )
727
+
728
+ clone_audio = gr.Audio(
729
+ label="Upload Voice Sample (10-60s of clear speech, WAV/MP3/M4A)",
730
+ type="filepath",
731
+ visible=False,
732
+ )
733
+
734
+ clone_info = gr.Markdown(
735
+ value=(
736
+ "> **Voice cloning tips:**\n"
737
+ "> - Use 10-60 seconds of clear, single-speaker audio\n"
738
+ "> - No background music or noise\n"
739
+ "> - WAV (16-bit), MP3, or M4A format\n"
740
+ "> - Sample rate at least 24 kHz recommended\n"
741
+ "> - Cloned voice TTS supports 10 core languages only"
742
+ ),
743
+ visible=False,
744
+ )
745
 
746
  add_pauses = gr.Checkbox(
747
  value=True,
748
  label="Add pauses between sections",
749
+ info="1.5s silence between chunks",
750
  )
751
 
752
+ generate_btn = gr.Button("Generate Audiobook", variant="primary", size="lg")
753
 
 
754
  with gr.Column(scale=1):
755
+ audio_output = gr.Audio(label="Generated Audiobook", type="filepath")
756
  stats_output = gr.Markdown(label="Generation Stats")
 
757
  with gr.Accordion("Translation / Narration Transcript", open=False):
758
  transcript_output = gr.Markdown()
759
 
760
+ sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)
761
+
762
+ voice_mode.change(
763
+ fn=on_voice_mode_change,
764
+ inputs=voice_mode,
765
+ outputs=[preset_voice, clone_audio, clone_info],
766
  )
767
 
768
  generate_btn.click(
769
  fn=generate_wrapper,
770
+ inputs=[text_input, file_input, target_lang, voice_mode,
771
+ preset_voice, clone_audio, add_pauses],
772
  outputs=[audio_output, stats_output, transcript_output],
773
  )
774
 
 
775
  gr.Markdown(
776
  "---\n"
777
+ "**How it works:**\n\n"
778
+ "**Preset voice mode:** Text goes to Qwen3.5-Omni-Plus (translates + speaks in one call) then outputs MP3\n\n"
779
+ "**Clone voice mode:** Voice sample goes to Qwen Voice Enrollment (creates voice ID), "
780
+ "text goes to Qwen3.5-Omni-Plus (translates to target language), "
781
+ "then Qwen3-TTS-VC (synthesizes speech with cloned voice) outputs MP3\n\n"
782
+ "**Voice cloning supports:** Chinese, English, Japanese, Korean, German, French, "
783
+ "Russian, Portuguese, Spanish, Italian\n\n"
784
+ "Built with Gradio | Model by Alibaba Qwen | API via DashScope"
785
+ )
786
 
787
  if __name__ == "__main__":
788
  demo.launch()
requirements.txt CHANGED
@@ -3,3 +3,4 @@ gradio>=5.25.0
3
  audioop-lts; python_version >= "3.13"
4
  pypdf>=4.0.0
5
  python-docx>=1.1.0
 
 
3
  audioop-lts; python_version >= "3.13"
4
  pypdf>=4.0.0
5
  python-docx>=1.1.0
6
+ requests>=2.31.0