Toadoum committed on
Commit
d34b995
Β·
verified Β·
1 Parent(s): be38378

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +448 -0
app.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PlotWeaver Audiobook Generator
3
+ English β†’ Hausa Translation + TTS with Timestamps
4
+
5
+ A POC demonstrating AI-powered audiobook creation for African languages.
6
+ """
7
+
8
+ import gradio as gr
9
+ import torch
10
+ import numpy as np
11
+ import tempfile
12
+ import os
13
+ import re
14
+ import json
15
+ from pathlib import Path
16
+ from datetime import timedelta
17
+ from typing import List, Tuple, Optional
18
+
19
+ # Document processing
20
+ import fitz # PyMuPDF
21
+ from docx import Document
22
+
23
+ # Translation & TTS
24
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, VitsModel
25
+ import scipy.io.wavfile as wavfile
26
+
27
# ============================================
# CONFIGURATION
# ============================================
NLLB_MODEL = "facebook/nllb-200-distilled-600M" # Optimized for speed
TTS_MODEL = "facebook/mms-tts-hau"  # Meta MMS text-to-speech, Hausa voice
SRC_LANG = "eng_Latn"  # NLLB language code: English, Latin script
TGT_LANG = "hau_Latn"  # NLLB language code: Hausa, Latin script
SAMPLE_RATE = 16000  # Hz; assumed to match MMS-TTS output rate — TODO confirm against model config
MAX_CHUNK_LENGTH = 200 # characters per TTS chunk
36
+
37
+ # ============================================
38
+ # MODEL LOADING (Cached)
39
+ # ============================================
40
def load_models():
    """Load the NLLB-200 translation model and the MMS-TTS Hausa model.

    Returns:
        A 4-tuple ``(nllb_model, nllb_tokenizer, tts_model, tts_tokenizer)``.
        Models are moved to the GPU (and fp16 is used for NLLB) when CUDA
        is available; both models are put in eval mode.
    """
    print("πŸ”„ Loading models...")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f" Device: {device}")

    # Translation model: fp16 on GPU halves memory, fp32 on CPU for accuracy.
    print(" Loading NLLB-200...")
    translator_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL, src_lang=SRC_LANG)
    translator = AutoModelForSeq2SeqLM.from_pretrained(
        NLLB_MODEL,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    )
    if use_cuda:
        translator = translator.cuda()
    translator.eval()

    # Text-to-speech model (VITS architecture, Hausa voice).
    print(" Loading MMS-TTS Hausa...")
    speech_model = VitsModel.from_pretrained(TTS_MODEL)
    speech_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL)
    if use_cuda:
        speech_model = speech_model.cuda()
    speech_model.eval()

    print("βœ… Models loaded successfully")
    return translator, translator_tokenizer, speech_model, speech_tokenizer
69
+
70
# Global model loading
# Handles start as None and are populated lazily by initialize_models().
nllb_model, nllb_tokenizer, tts_model, tts_tokenizer = None, None, None, None
72
+
73
def initialize_models():
    """Populate the global model handles on first call; no-op afterwards."""
    global nllb_model, nllb_tokenizer, tts_model, tts_tokenizer
    if nllb_model is not None:
        return
    nllb_model, nllb_tokenizer, tts_model, tts_tokenizer = load_models()
77
+
78
+ # ============================================
79
+ # DOCUMENT EXTRACTION
80
+ # ============================================
81
def extract_text_from_pdf(file_path: str) -> List[dict]:
    """Extract text from a PDF, producing one entry per non-empty page.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A list of ``{"chapter": "Page N", "text": ...}`` dicts; pages with
        no extractable text are skipped.
    """
    doc = fitz.open(file_path)
    pages = [
        {"chapter": f"Page {page_num}", "text": content}
        for page_num, page in enumerate(doc, 1)
        if (content := page.get_text().strip())
    ]
    doc.close()
    return pages
96
+
97
def extract_text_from_docx(file_path: str) -> List[dict]:
    """Extract text from a DOCX file, grouping paragraphs into chapters.

    A paragraph is treated as a chapter heading when it uses a Word
    "Heading" style, or as a fallback heuristic when it is short (< 50
    chars) and entirely upper-case. Body paragraphs are appended to the
    current chapter separated by blank lines.
    """
    document = Document(file_path)
    chapters = []
    chapter_num = 1
    current = {"chapter": "Chapter 1", "text": ""}

    def _is_heading(paragraph, body):
        # Heading style, or a short all-caps line, marks a chapter break.
        return paragraph.style.name.startswith('Heading') or (len(body) < 50 and body.isupper())

    for paragraph in document.paragraphs:
        body = paragraph.text.strip()
        if not body:
            continue

        if _is_heading(paragraph, body):
            # Flush the chapter collected so far before starting a new one.
            if current["text"]:
                chapters.append(current)
                chapter_num += 1
            current = {"chapter": body or f"Chapter {chapter_num}", "text": ""}
        else:
            current["text"] += body + "\n\n"

    if current["text"]:
        chapters.append(current)

    return chapters
122
+
123
def extract_text(file_path: str) -> List[dict]:
    """Dispatch an uploaded file to the right extractor by extension.

    Args:
        file_path: Path to a .pdf, .docx/.doc, or .txt file.

    Returns:
        A list of ``{"chapter": ..., "text": ...}`` dicts.

    Raises:
        ValueError: If the file extension is not supported.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)
    if suffix in (".docx", ".doc"):
        return extract_text_from_docx(file_path)
    if suffix == ".txt":
        content = Path(file_path).read_text(encoding="utf-8")
        return [{"chapter": "Full Text", "text": content}]
    raise ValueError(f"Unsupported file format: {suffix}")
137
+
138
+ # ============================================
139
+ # TRANSLATION (NLLB-200)
140
+ # ============================================
141
def translate_text(text: str) -> str:
    """Translate English text to Hausa using NLLB-200.

    The input is split into sentences on ., !, ? boundaries; each sentence
    is translated independently with beam search and the translations are
    re-joined with single spaces.

    Args:
        text: English source text.

    Returns:
        The Hausa translation as a single string (empty if *text* has no
        non-blank sentences).
    """
    initialize_models()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Split into sentences for better translation
    sentences = re.split(r'(?<=[.!?])\s+', text)
    translated_sentences = []

    # Get target language token
    # (NLLB steers decoding by forcing a language-code token like "hau_Latn"
    # as the first generated token.)
    tgt_lang_id = nllb_tokenizer.convert_tokens_to_ids(TGT_LANG)

    with torch.no_grad():
        for sentence in sentences:
            if not sentence.strip():
                continue

            # Tokenize
            inputs = nllb_tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )

            if device == "cuda":
                inputs = {k: v.cuda() for k, v in inputs.items()}

            # Translate
            # NOTE(review): max_length=256 can truncate the translation of a
            # long sentence even though inputs allow 512 tokens — confirm.
            outputs = nllb_model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=256,
                num_beams=5,
                early_stopping=True
            )

            # Decode
            translated = nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
            translated_sentences.append(translated)

    return " ".join(translated_sentences)
185
+
186
+ # ============================================
187
+ # TEXT-TO-SPEECH (MMS-TTS)
188
+ # ============================================
189
def split_text_for_tts(text: str, max_length: int = MAX_CHUNK_LENGTH) -> List[str]:
    """Split text into chunks of at most *max_length* characters for TTS.

    Sentences (split on ., !, ? boundaries) are packed greedily into chunks.
    Fixes a bug in the previous version where a single sentence longer than
    *max_length* was passed through whole, producing oversized TTS inputs:
    such sentences are now hard-split on word boundaries. Blank sentences
    are skipped, so empty input yields an empty list.

    Args:
        text: The text to split.
        max_length: Maximum characters per chunk.

    Returns:
        A list of non-empty chunks, each at most *max_length* characters
        (except a single word longer than max_length, which cannot be
        split further).
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Pre-pass: hard-split any sentence that alone exceeds the budget.
    pieces = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        if len(sentence) <= max_length:
            pieces.append(sentence)
            continue
        part = ""
        for word in sentence.split():
            if part and len(part) + 1 + len(word) > max_length:
                pieces.append(part)
                part = word
            else:
                part = f"{part} {word}" if part else word
        if part:
            pieces.append(part)

    # Greedily pack pieces into chunks (same packing as before).
    chunks = []
    current_chunk = ""
    for piece in pieces:
        if len(current_chunk) + len(piece) <= max_length:
            current_chunk += piece + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = piece + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
208
+
209
def generate_audio(text: str) -> Tuple[np.ndarray, List[dict]]:
    """Synthesize Hausa speech for *text*.

    The text is chunked via split_text_for_tts(), each chunk is synthesized
    with the MMS-TTS model, and the per-chunk start/end offsets within the
    concatenated waveform are recorded.

    Returns:
        ``(waveform, timestamps)`` where *waveform* is a 1-D float array and
        *timestamps* is a list of ``{"start", "end", "text"}`` dicts with
        HH:MM:SS.mmm strings.
    """
    initialize_models()

    on_gpu = torch.cuda.is_available()
    segments = []
    markers = []
    elapsed = 0.0

    for piece in split_text_for_tts(text):
        if not piece.strip():
            continue

        # Tokenize this chunk; move tensors to the GPU when available.
        encoded = tts_tokenizer(piece, return_tensors="pt")
        if on_gpu:
            encoded = {k: v.cuda() for k, v in encoded.items()}

        # Inference only — no gradients needed.
        with torch.no_grad():
            waveform = tts_model(**encoded).waveform

        samples = waveform.squeeze().cpu().numpy()
        segments.append(samples)

        # Record where this chunk sits inside the final audio stream.
        seconds = len(samples) / SAMPLE_RATE
        markers.append({
            "start": format_timestamp(elapsed),
            "end": format_timestamp(elapsed + seconds),
            "text": piece
        })
        elapsed += seconds

    # Fall back to one second of silence when nothing was synthesized.
    if segments:
        audio = np.concatenate(segments)
    else:
        audio = np.zeros(SAMPLE_RATE)

    return audio, markers
252
+
253
def format_timestamp(seconds: float) -> str:
    """Format a duration in seconds as HH:MM:SS.mmm.

    Fixes a bug in the previous timedelta-based version: ``timedelta.seconds``
    silently discards whole days, so durations of 24h or more wrapped the
    hour field back to zero. Hours now grow without wrapping.

    Args:
        seconds: Non-negative duration in seconds.

    Returns:
        The duration as a zero-padded "HH:MM:SS.mmm" string.
    """
    # Round to whole microseconds first (matching timedelta's behavior),
    # then truncate to milliseconds.
    total_ms = round(seconds * 1_000_000) // 1000
    hours, rem_ms = divmod(total_ms, 3_600_000)
    minutes, rem_ms = divmod(rem_ms, 60_000)
    secs, milliseconds = divmod(rem_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"
260
+
261
+ # ============================================
262
+ # MAIN PIPELINE
263
+ # ============================================
264
def process_document(file, progress=gr.Progress()) -> Tuple[Optional[str], str, str, str]:
    """Run the full pipeline: document -> English text -> Hausa -> audiobook.

    Args:
        file: The uploaded file; either a plain path string (what
            ``gr.File(type="filepath")`` yields in Gradio 4.x) or a
            tempfile-like object with a ``.name`` attribute.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(audio_path, transcript_markdown, timestamps_text, status)``;
        *audio_path* is None when processing fails or no input was given.
    """
    if file is None:
        return None, "", "", "⚠️ Please upload a document"

    try:
        # BUGFIX: gr.File(type="filepath") returns a str, on which `.name`
        # raised AttributeError; accept both the string and object forms.
        file_path = file if isinstance(file, str) else file.name

        progress(0.1, desc="πŸ“„ Extracting text...")
        chapters = extract_text(file_path)

        if not chapters:
            return None, "", "", "⚠️ No text found in document"

        # Combine all text (for POC, limit to first 2000 chars)
        full_text = "\n\n".join([c["text"] for c in chapters])[:2000]

        progress(0.3, desc="🌍 Translating to Hausa...")
        translated_text = translate_text(full_text)

        progress(0.6, desc="πŸŽ™οΈ Generating audio...")
        audio, timestamps = generate_audio(translated_text)

        progress(0.9, desc="πŸ’Ύ Saving audiobook...")

        # Clip to [-1, 1] before int16 conversion to avoid integer
        # wrap-around distortion on samples outside the nominal range.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            wavfile.write(f.name, SAMPLE_RATE, pcm)
            audio_path = f.name

        # Human-readable "[start -> end] text" lines for the timestamps tab.
        timestamps_text = "\n".join([
            f"[{t['start']} β†’ {t['end']}] {t['text']}"
            for t in timestamps
        ])

        # Side-by-side transcript: truncated English source + full Hausa.
        transcript = f"""## Original (English)
{full_text[:500]}{'...' if len(full_text) > 500 else ''}

## Translation (Hausa)
{translated_text}
"""

        progress(1.0, desc="βœ… Complete!")

        return audio_path, transcript, timestamps_text, "βœ… Audiobook generated successfully!"

    except Exception as e:
        # Surface the failure in the UI status box rather than crashing.
        return None, "", "", f"❌ Error: {str(e)}"
316
+
317
+ # ============================================
318
+ # GRADIO INTERFACE
319
+ # ============================================
320
def create_interface():
    """Build and return the Gradio Blocks UI for the audiobook generator."""

    with gr.Blocks(
        title="PlotWeaver Audiobook Generator",
        theme=gr.themes.Soft(
            primary_hue="orange",
            secondary_hue="blue",
        ),
        # Lightweight styling for the header/subtitle/output panels.
        css="""
        .main-title {
            text-align: center;
            margin-bottom: 1rem;
        }
        .subtitle {
            text-align: center;
            color: #666;
            margin-bottom: 2rem;
        }
        .output-panel {
            border: 1px solid #ddd;
            border-radius: 8px;
            padding: 1rem;
        }
        """
    ) as demo:

        # Header
        gr.HTML("""
        <div class="main-title">
            <h1>🎧 PlotWeaver Audiobook Generator</h1>
        </div>
        <div class="subtitle">
            <p><strong>Transform English documents into Hausa audiobooks with timestamps</strong></p>
            <p>Powered by NLLB-200 Translation + MMS-TTS</p>
        </div>
        """)

        with gr.Row():
            # Input Column: upload control, trigger button, status line.
            with gr.Column(scale=1):
                gr.Markdown("### πŸ“ Upload Document")

                file_input = gr.File(
                    label="Upload PDF, DOCX, or TXT",
                    file_types=[".pdf", ".docx", ".doc", ".txt"],
                    type="filepath"
                )

                generate_btn = gr.Button(
                    "πŸš€ Generate Audiobook",
                    variant="primary",
                    size="lg"
                )

                # Read-only status line, filled by process_document().
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=1
                )

                gr.Markdown("""
                ---
                ### ℹ️ How it works
                1. **Upload** your English document
                2. **AI translates** to Hausa using NLLB-200
                3. **TTS generates** natural Hausa audio
                4. **Download** your audiobook with timestamps

                ---
                ### 🌍 Supported Languages
                - πŸ‡¬πŸ‡§ English β†’ πŸ‡³πŸ‡¬ Hausa
                - *More languages coming soon!*
                """)

            # Output Column: audio player plus transcript/timestamps tabs.
            with gr.Column(scale=2):
                gr.Markdown("### 🎧 Generated Audiobook")

                audio_output = gr.Audio(
                    label="Hausa Audiobook",
                    type="filepath",
                    interactive=False
                )

                with gr.Tabs():
                    with gr.Tab("πŸ“œ Transcript"):
                        transcript_output = gr.Markdown(
                            label="Translation",
                            value="*Upload a document to see the transcript*"
                        )

                    with gr.Tab("⏱️ Timestamps"):
                        timestamps_output = gr.Textbox(
                            label="Timestamps",
                            lines=10,
                            interactive=False,
                            placeholder="Timestamps will appear here..."
                        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 8px;">
            <p><strong>PlotWeaver</strong> - AI-Powered African Language Technology</p>
            <p style="color: #666; font-size: 0.9rem;">
                Democratizing content access across Africa through voice technology
            </p>
        </div>
        """)

        # Event handlers
        # The button drives the whole pipeline and fills all four outputs.
        generate_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[audio_output, transcript_output, timestamps_output, status_output],
            show_progress=True
        )

    return demo
438
+
439
+ # ============================================
440
+ # MAIN
441
+ # ============================================
442
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces; port 7860 is the
    # Hugging Face Spaces convention.
    demo = create_interface()
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
+ )