Spaces:

PlotweaverAI
/

plotweaver-audiobook

Running

App Files Files Community

Toadoum committed 6 days ago

Commit

3cd33f6

verified ·

1 Parent(s): 6927959

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -324

app.py CHANGED Viewed

@@ -2,125 +2,55 @@
 PlotWeaver Audiobook Generator
 English → Hausa Translation + TTS with Timestamps
-A POC demonstrating AI-powered audiobook creation for African languages.
 """
 import gradio as gr
 import torch
 import numpy as np
 import tempfile
-import os
 import re
-import json
 from pathlib import Path
 from datetime import timedelta
-from typing import List, Tuple, Optional
 # Document processing
 import fitz  # PyMuPDF
 from docx import Document
-# Translation & TTS
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, VitsModel
 import scipy.io.wavfile as wavfile
 # ============================================
 # CONFIGURATION
 # ============================================
-NLLB_MODEL = "facebook/nllb-200-distilled-600M"  # Optimized for speed
 TTS_MODEL = "facebook/mms-tts-hau"
 SRC_LANG = "eng_Latn"
 TGT_LANG = "hau_Latn"
 SAMPLE_RATE = 16000
-MAX_CHUNK_LENGTH = 200  # characters per TTS chunk
-# ============================================
-# MODEL LOADING (Cached)
-# ============================================
-def load_models():
-    """Load translation and TTS models."""
-    print("🔄 Loading models...")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"   Device: {device}")
-    # Load NLLB translation model
-    print("   Loading NLLB-200...")
-    nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL, src_lang=SRC_LANG)
-    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
-        NLLB_MODEL,
-        torch_dtype=torch.float16 if device == "cuda" else torch.float32
-    )
-    if device == "cuda":
-        nllb_model = nllb_model.cuda()
-    nllb_model.eval()
-    # Load MMS-TTS Hausa
-    print("   Loading MMS-TTS Hausa...")
-    tts_model = VitsModel.from_pretrained(TTS_MODEL)
-    tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL)
-    if device == "cuda":
-        tts_model = tts_model.cuda()
-    tts_model.eval()
-    print("✅ Models loaded successfully")
-    return nllb_model, nllb_tokenizer, tts_model, tts_tokenizer
-# Global model loading
-nllb_model, nllb_tokenizer, tts_model, tts_tokenizer = None, None, None, None
-def initialize_models():
-    global nllb_model, nllb_tokenizer, tts_model, tts_tokenizer
-    if nllb_model is None:
-        nllb_model, nllb_tokenizer, tts_model, tts_tokenizer = load_models()
 # ============================================
 # DOCUMENT EXTRACTION
 # ============================================
-def extract_text_from_pdf(file_path: str) -> List[dict]:
-    """Extract text from PDF with page numbers."""
     doc = fitz.open(file_path)
-    chapters = []
-    for page_num, page in enumerate(doc, 1):
-        text = page.get_text().strip()
-        if text:
-            chapters.append({
-                "chapter": f"Page {page_num}",
-                "text": text
-            })
     doc.close()
-    return chapters
-def extract_text_from_docx(file_path: str) -> List[dict]:
-    """Extract text from DOCX with paragraph grouping."""
     doc = Document(file_path)
-    chapters = []
-    current_chapter = {"chapter": "Chapter 1", "text": ""}
-    chapter_num = 1
-    for para in doc.paragraphs:
-        text = para.text.strip()
-        if not text:
-            continue
-        # Detect chapter headings (simple heuristic)
-        if para.style.name.startswith('Heading') or (len(text) < 50 and text.isupper()):
-            if current_chapter["text"]:
-                chapters.append(current_chapter)
-            chapter_num += 1
-            current_chapter = {"chapter": text or f"Chapter {chapter_num}", "text": ""}
-        else:
-            current_chapter["text"] += text + "\n\n"
-    if current_chapter["text"]:
-        chapters.append(current_chapter)
-    return chapters
-def extract_text(file_path: str) -> List[dict]:
     """Extract text from uploaded file."""
     ext = Path(file_path).suffix.lower()
@@ -130,186 +60,179 @@ def extract_text(file_path: str) -> List[dict]:
         return extract_text_from_docx(file_path)
     elif ext == ".txt":
         with open(file_path, "r", encoding="utf-8") as f:
-            text = f.read()
-        return [{"chapter": "Full Text", "text": text}]
     else:
-        raise ValueError(f"Unsupported file format: {ext}")
 # ============================================
-# TRANSLATION (NLLB-200)
 # ============================================
-def translate_text(text: str) -> str:
-    """Translate English text to Hausa using NLLB-200."""
-    initialize_models()
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    # Split into sentences for better translation
     sentences = re.split(r'(?<=[.!?])\s+', text)
-    translated_sentences = []
-    # Get target language token
-    tgt_lang_id = nllb_tokenizer.convert_tokens_to_ids(TGT_LANG)
     with torch.no_grad():
         for sentence in sentences:
             if not sentence.strip():
                 continue
-            # Tokenize
-            inputs = nllb_tokenizer(
-                sentence,
-                return_tensors="pt",
-                truncation=True,
-                max_length=512,
-                padding=True
-            )
             if device == "cuda":
                 inputs = {k: v.cuda() for k, v in inputs.items()}
-            # Translate
-            outputs = nllb_model.generate(
                 **inputs,
                 forced_bos_token_id=tgt_lang_id,
                 max_length=256,
-                num_beams=5,
-                early_stopping=True
             )
-            # Decode
-            translated = nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
-            translated_sentences.append(translated)
-    return " ".join(translated_sentences)
 # ============================================
-# TEXT-TO-SPEECH (MMS-TTS)
 # ============================================
-def split_text_for_tts(text: str, max_length: int = MAX_CHUNK_LENGTH) -> List[str]:
-    """Split text into chunks suitable for TTS."""
-    # Split by sentences first
     sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) <= max_length:
-            current_chunk += sentence + " "
         else:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
     return chunks
 def generate_audio(text: str) -> Tuple[np.ndarray, List[dict]]:
-    """Generate audio from Hausa text with timestamps."""
-    initialize_models()
-    chunks = split_text_for_tts(text)
     audio_segments = []
     timestamps = []
     current_time = 0.0
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    for chunk in chunks:
-        if not chunk.strip():
-            continue
-        # Tokenize
-        inputs = tts_tokenizer(chunk, return_tensors="pt")
-        if device == "cuda":
-            inputs = {k: v.cuda() for k, v in inputs.items()}
-        # Generate audio
-        with torch.no_grad():
-            output = tts_model(**inputs).waveform
-        audio = output.squeeze().cpu().numpy()
-        audio_segments.append(audio)
-        # Calculate timestamp
-        duration = len(audio) / SAMPLE_RATE
-        timestamps.append({
-            "start": format_timestamp(current_time),
-            "end": format_timestamp(current_time + duration),
-            "text": chunk
-        })
-        current_time += duration
-    # Concatenate all audio
-    if audio_segments:
-        full_audio = np.concatenate(audio_segments)
-    else:
-        full_audio = np.zeros(SAMPLE_RATE)  # 1 second of silence
-    return full_audio, timestamps
-def format_timestamp(seconds: float) -> str:
-    """Format seconds as HH:MM:SS.mmm"""
-    td = timedelta(seconds=seconds)
-    hours, remainder = divmod(td.seconds, 3600)
-    minutes, secs = divmod(remainder, 60)
-    milliseconds = int(td.microseconds / 1000)
-    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"
 # ============================================
 # MAIN PIPELINE
 # ============================================
-def process_document(file, progress=gr.Progress()) -> Tuple[str, str, str, str]:
-    """
-    Main pipeline: Document → Translation → TTS → Audiobook
-    Returns: (audio_path, transcript, timestamps_json, status)
-    """
     if file is None:
         return None, "", "", "⚠️ Please upload a document"
     try:
         progress(0.1, desc="📄 Extracting text...")
-        chapters = extract_text(file.name)
-        if not chapters:
-            return None, "", "", "⚠️ No text found in document"
-        # Combine all text (for POC, limit to first 2000 chars)
-        full_text = "\n\n".join([c["text"] for c in chapters])[:2000]
         progress(0.3, desc="🌍 Translating to Hausa...")
-        translated_text = translate_text(full_text)
         progress(0.6, desc="🎙️ Generating audio...")
-        audio, timestamps = generate_audio(translated_text)
-        progress(0.9, desc="💾 Saving audiobook...")
-        # Save audio
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             wavfile.write(f.name, SAMPLE_RATE, (audio * 32767).astype(np.int16))
             audio_path = f.name
-        # Format timestamps
-        timestamps_text = "\n".join([
-            f"[{t['start']} → {t['end']}] {t['text']}"
-            for t in timestamps
-        ])
-        # Create transcript
-        transcript = f"""## Original (English)
-{full_text[:500]}{'...' if len(full_text) > 500 else ''}
-## Translation (Hausa)
-{translated_text}
-"""
-        progress(1.0, desc="✅ Complete!")
-        return audio_path, transcript, timestamps_text, "✅ Audiobook generated successfully!"
     except Exception as e:
         return None, "", "", f"❌ Error: {str(e)}"
@@ -317,132 +240,48 @@ def process_document(file, progress=gr.Progress()) -> Tuple[str, str, str, str]:
 # ============================================
 # GRADIO INTERFACE
 # ============================================
-def create_interface():
-    with gr.Blocks(
-        title="PlotWeaver Audiobook Generator",
-        theme=gr.themes.Soft(
-            primary_hue="orange",
-            secondary_hue="blue",
-        ),
-        css="""
-        .main-title {
-            text-align: center;
-            margin-bottom: 1rem;
-        }
-        .subtitle {
-            text-align: center;
-            color: #666;
-            margin-bottom: 2rem;
-        }
-        .output-panel {
-            border: 1px solid #ddd;
-            border-radius: 8px;
-            padding: 1rem;
-        }
-        """
-    ) as demo:
-        # Header
-        gr.HTML("""
-        <div class="main-title">
-            <h1>🎧 PlotWeaver Audiobook Generator</h1>
-        </div>
-        <div class="subtitle">
-            <p><strong>Transform English documents into Hausa audiobooks with timestamps</strong></p>
-            <p>Powered by NLLB-200 Translation + MMS-TTS</p>
-        </div>
-        """)
-        with gr.Row():
-            # Input Column
-            with gr.Column(scale=1):
-                gr.Markdown("### 📁 Upload Document")
-                file_input = gr.File(
-                    label="Upload PDF, DOCX, or TXT",
-                    file_types=[".pdf", ".docx", ".doc", ".txt"],
-                    type="filepath"
-                )
-                generate_btn = gr.Button(
-                    "🚀 Generate Audiobook",
-                    variant="primary",
-                    size="lg"
-                )
-                status_output = gr.Textbox(
-                    label="Status",
-                    interactive=False,
-                    lines=1
-                )
-                gr.Markdown("""
-                ---
-                ### ℹ️ How it works
-                1. **Upload** your English document
-                2. **AI translates** to Hausa using NLLB-200
-                3. **TTS generates** natural Hausa audio
-                4. **Download** your audiobook with timestamps
-                ---
-                ### 🌍 Supported Languages
-                - 🇬🇧 English → 🇳🇬 Hausa
-                - *More languages coming soon!*
-                """)
-            # Output Column
-            with gr.Column(scale=2):
-                gr.Markdown("### 🎧 Generated Audiobook")
-                audio_output = gr.Audio(
-                    label="Hausa Audiobook",
-                    type="filepath",
-                    interactive=False
-                )
-                with gr.Tabs():
-                    with gr.Tab("📜 Transcript"):
-                        transcript_output = gr.Markdown(
-                            label="Translation",
-                            value="*Upload a document to see the transcript*"
-                        )
-                    with gr.Tab("⏱️ Timestamps"):
-                        timestamps_output = gr.Textbox(
-                            label="Timestamps",
-                            lines=10,
-                            interactive=False,
-                            placeholder="Timestamps will appear here..."
-                        )
-        # Footer
-        gr.HTML("""
-        <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 8px;">
-            <p><strong>PlotWeaver</strong> - AI-Powered African Language Technology</p>
-            <p style="color: #666; font-size: 0.9rem;">
-                Democratizing content access across Africa through voice technology
-            </p>
-        </div>
-        """)
-        # Event handlers
-        generate_btn.click(
-            fn=process_document,
-            inputs=[file_input],
-            outputs=[audio_output, transcript_output, timestamps_output, status_output],
-            show_progress=True
-        )
-    return demo
 # ============================================
-# MAIN
 # ============================================
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.launch(
-        share=False,
-        server_name="0.0.0.0",
-        server_port=7860
-    )

 PlotWeaver Audiobook Generator
 English → Hausa Translation + TTS with Timestamps
+Optimized for fast startup on HuggingFace Spaces.
 """
 import gradio as gr
 import torch
 import numpy as np
 import tempfile
 import re
 from pathlib import Path
 from datetime import timedelta
+from typing import List, Tuple
 # Document processing
 import fitz  # PyMuPDF
 from docx import Document
 import scipy.io.wavfile as wavfile
 # ============================================
 # CONFIGURATION
 # ============================================
+NLLB_MODEL = "facebook/nllb-200-distilled-600M"
 TTS_MODEL = "facebook/mms-tts-hau"
 SRC_LANG = "eng_Latn"
 TGT_LANG = "hau_Latn"
 SAMPLE_RATE = 16000
+MAX_CHUNK_LENGTH = 200
+# Global model cache (lazy loaded)
+_models = {}
 # ============================================
 # DOCUMENT EXTRACTION
 # ============================================
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the plain text of every page in a PDF.

    Args:
        file_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        The text of all pages, each page followed by a newline, with leading
        and trailing whitespace stripped from the combined result.
    """
    doc = fitz.open(file_path)
    try:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() + "\n" for page in doc)
    finally:
        # Release the document handle even if a page fails to extract.
        doc.close()
    return text.strip()
def extract_text_from_docx(file_path: str) -> str:
    """Return the non-blank paragraphs of a DOCX file, one per line."""
    document = Document(file_path)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        # Paragraphs that are empty or whitespace-only are dropped.
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
+def extract_text(file_path: str) -> str:
     """Extract text from uploaded file."""
     ext = Path(file_path).suffix.lower()
         return extract_text_from_docx(file_path)
     elif ext == ".txt":
         with open(file_path, "r", encoding="utf-8") as f:
+            return f.read()
     else:
+        raise ValueError(f"Unsupported format: {ext}")
 # ============================================
+# LAZY MODEL LOADING
 # ============================================
def get_translation_model():
    """Return the cached NLLB-200 model and tokenizer, loading them on first use.

    The ``transformers`` import is deferred to the first call so that merely
    importing this module does not pay for it.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and has been
        moved to the GPU when CUDA is available.
    """
    if "nllb" not in _models:
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        print("📥 Loading NLLB-200...")
        use_cuda = torch.cuda.is_available()
        tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL, src_lang=SRC_LANG)
        # Half precision is only requested on GPU; on CPU the model stays in
        # float32, where generation is reliably supported.
        model = AutoModelForSeq2SeqLM.from_pretrained(
            NLLB_MODEL,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
        )
        if use_cuda:
            model = model.cuda()
        model.eval()
        _models["nllb"] = (model, tokenizer)
        print("✅ NLLB-200 loaded")
    return _models["nllb"]
def get_tts_model():
    """Fetch the MMS-TTS Hausa (model, tokenizer) pair, loading it on first request."""
    cached = _models.get("tts")
    if cached is not None:
        return cached

    # Deferred import: transformers is only pulled in when TTS is first needed.
    from transformers import VitsModel, AutoTokenizer
    print("📥 Loading MMS-TTS Hausa...")
    tts = VitsModel.from_pretrained(TTS_MODEL)
    tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL)
    if torch.cuda.is_available():
        tts = tts.cuda()
    tts.eval()
    _models["tts"] = (tts, tts_tokenizer)
    print("✅ MMS-TTS loaded")
    return _models["tts"]
+# ============================================
+# TRANSLATION
+# ============================================
def translate_text(text: str) -> str:
    """Translate English text into Hausa one sentence at a time.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace; blank
    pieces are skipped and the translated sentences are joined with spaces.
    """
    model, tokenizer = get_translation_model()
    on_gpu = torch.cuda.is_available()
    target_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

    pieces = [p for p in re.split(r'(?<=[.!?])\s+', text) if p.strip()]
    results = []
    with torch.no_grad():
        for piece in pieces:
            encoded = tokenizer(
                piece,
                return_tensors="pt",
                truncation=True,
                max_length=256,
            )
            if on_gpu:
                encoded = {key: tensor.cuda() for key, tensor in encoded.items()}
            generated = model.generate(
                **encoded,
                forced_bos_token_id=target_id,
                max_length=256,
                num_beams=4,
            )
            decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
            results.append(decoded)
    return " ".join(results)
 # ============================================
+# TEXT-TO-SPEECH
 # ============================================
def split_text(text: str, max_len: int = MAX_CHUNK_LENGTH) -> List[str]:
    """Split text into TTS-friendly chunks.

    Sentences (split after ``.``, ``!`` or ``?`` followed by whitespace) are
    packed greedily into chunks of at most ``max_len`` characters. A single
    sentence longer than ``max_len`` is kept whole as its own chunk.

    Args:
        text: Text to split.
        max_len: Maximum number of characters to pack into one chunk.

    Returns:
        The list of non-empty chunks; an empty list when ``text`` is empty or
        contains only whitespace.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current = [], ""
    for s in sentences:
        if not s.strip():
            # Blank pieces (empty input, trailing whitespace) carry no speech
            # and would otherwise surface as an empty chunk.
            continue
        if len(current) + len(s) <= max_len:
            current += s + " "
        else:
            if current:
                chunks.append(current.strip())
            current = s + " "
    if current:
        chunks.append(current.strip())
    return chunks
 def generate_audio(text: str) -> Tuple[np.ndarray, List[dict]]:
+    """Generate audio with timestamps."""
+    model, tokenizer = get_tts_model()
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    chunks = split_text(text)
     audio_segments = []
     timestamps = []
     current_time = 0.0
+    with torch.no_grad():
+        for chunk in chunks:
+            if not chunk.strip():
+                continue
+            inputs = tokenizer(chunk, return_tensors="pt")
+            if device == "cuda":
+                inputs = {k: v.cuda() for k, v in inputs.items()}
+            audio = model(**inputs).waveform.squeeze().cpu().numpy()
+            audio_segments.append(audio)
+            duration = len(audio) / SAMPLE_RATE
+            timestamps.append({
+                "start": format_time(current_time),
+                "end": format_time(current_time + duration),
+                "text": chunk
+            })
+            current_time += duration
+    return np.concatenate(audio_segments) if audio_segments else np.zeros(SAMPLE_RATE), timestamps
def format_time(seconds: float) -> str:
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    Args:
        seconds: Non-negative duration in seconds.

    Returns:
        The zero-padded timestamp, rounded to the nearest millisecond.
    """
    # Round once to whole milliseconds and split with integer arithmetic, so
    # float error (e.g. 1.001 % 1 == 0.000999...) cannot drop a millisecond.
    total_ms = int(round(seconds * 1000))
    total_s, ms = divmod(total_ms, 1000)
    h, r = divmod(total_s, 3600)
    m, s = divmod(r, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
 # ============================================
 # MAIN PIPELINE
 # ============================================
def process_document(file, progress=gr.Progress()):
    """Main pipeline: Document → Translation → TTS → Audiobook

    Args:
        file: The uploaded document from the Gradio file input: either a path
            string or an object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A ``(audio_path, transcript, timestamps_text, status)`` tuple. When
        there is nothing to process or an error occurs, ``audio_path`` is
        ``None``, the two text fields are empty and ``status`` explains why.
    """
    if file is None:
        return None, "", "", "⚠️ Please upload a document"
    try:
        # Extract text
        progress(0.1, desc="📄 Extracting text...")
        # The upload may arrive as a plain path string or as a temp-file
        # wrapper with a ``.name`` attribute; accept both.
        file_path = file if isinstance(file, str) else file.name
        text = extract_text(file_path)[:2000]  # Limit for POC
        if not text.strip():
            return None, "", "", "⚠️ No text found"
        # Translate
        progress(0.3, desc="🌍 Translating to Hausa...")
        translated = translate_text(text)
        # Generate audio
        progress(0.6, desc="🎙️ Generating audio...")
        audio, timestamps = generate_audio(translated)
        # Save
        progress(0.9, desc="💾 Saving...")
        # Reserve a persistent temp path, then write after the handle is
        # closed so the file is never opened twice at once.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            audio_path = f.name
        wavfile.write(audio_path, SAMPLE_RATE, (audio * 32767).astype(np.int16))
        # Format output
        timestamps_text = "\n".join([f"[{t['start']} → {t['end']}] {t['text']}" for t in timestamps])
        transcript = f"## Original (English)\n{text[:500]}{'...' if len(text) > 500 else ''}\n\n## Translation (Hausa)\n{translated}"
        progress(1.0, desc="✅ Done!")
        return audio_path, transcript, timestamps_text, "✅ Audiobook generated!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, "", "", f"❌ Error: {str(e)}"
 # ============================================
 # GRADIO INTERFACE
 # ============================================
# Page layout: header, an input column (upload, button, status, help text),
# an output column (audio player plus transcript/timestamp tabs), and a footer.
with gr.Blocks(title="PlotWeaver Audiobook", theme=gr.themes.Soft(primary_hue="orange")) as demo:
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 1rem;">
        <h1>🎧 PlotWeaver Audiobook Generator</h1>
        <p><strong>English → Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
    </div>
    """)
    with gr.Row():
        # Input side
        with gr.Column(scale=1):
            file_input = gr.File(
                label="📁 Upload PDF, DOCX, or TXT",
                file_types=[".pdf", ".docx", ".txt"],
            )
            btn = gr.Button(
                "🚀 Generate Audiobook",
                variant="primary",
                size="lg",
            )
            status = gr.Textbox(label="Status", interactive=False)
            gr.Markdown("""
            ### How it works
            1. Upload English document
            2. AI translates to Hausa
            3. TTS generates audio
            4. Download with timestamps
            """)
        # Output side
        with gr.Column(scale=2):
            audio_out = gr.Audio(label="🎧 Hausa Audiobook")
            with gr.Tabs():
                with gr.Tab("📜 Transcript"):
                    transcript = gr.Markdown()
                with gr.Tab("⏱️ Timestamps"):
                    timestamps = gr.Textbox(lines=8, interactive=False)
    gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
        <strong>PlotWeaver</strong> - AI for African Languages
    </div>""")
    # The button runs the whole pipeline and fills all four output widgets.
    btn.click(
        fn=process_document,
        inputs=[file_input],
        outputs=[audio_out, transcript, timestamps, status],
    )
# ============================================
# LAUNCH
# ============================================
if __name__ == "__main__":
    demo.launch()