Toadoum committed on
Commit
7eea943
Β·
verified Β·
1 Parent(s): 3c15094

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -28
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  PlotWeaver Audiobook Generator
3
- English β†’ Hausa Translation + TTS with Timestamps
4
 
5
  Optimized for fast startup on HuggingFace Spaces.
6
  """
@@ -12,13 +12,14 @@ import tempfile
12
  import re
13
  from pathlib import Path
14
  from datetime import timedelta
15
- from typing import List, Tuple
16
 
17
  # Document processing
18
  import fitz # PyMuPDF
19
  from docx import Document
20
 
21
  import scipy.io.wavfile as wavfile
 
22
 
23
  # ============================================
24
  # CONFIGURATION
@@ -30,6 +31,25 @@ TGT_LANG = "hau_Latn"
30
  SAMPLE_RATE = 16000
31
  MAX_CHUNK_LENGTH = 200
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Global model cache (lazy loaded)
34
  _models = {}
35
 
@@ -197,6 +217,76 @@ def get_tts_model():
197
 
198
  return _models["tts"]
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  # ============================================
201
  # TRANSLATION
202
  # ============================================
@@ -295,8 +385,8 @@ def format_time(seconds: float) -> str:
295
  # ============================================
296
  MAX_CHARS = 10000 # Max characters to process (increase for longer files)
297
 
298
- def process_document(file, progress=gr.Progress()):
299
- """Main pipeline: Document β†’ Translation β†’ TTS β†’ Audiobook"""
300
 
301
  if file is None:
302
  return None, "", "", "⚠️ Please upload a document"
@@ -320,8 +410,21 @@ def process_document(file, progress=gr.Progress()):
320
 
321
  # Split into sentences for batch processing
322
  sentences = re.split(r'(?<=[.!?])\s+', text)
 
323
  total_sentences = len(sentences)
324
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  # Translate in batches
326
  progress(0.1, desc=f"🌍 Translating {total_sentences} sentences...")
327
  translated_sentences = []
@@ -336,8 +439,9 @@ def process_document(file, progress=gr.Progress()):
336
  continue
337
 
338
  # Update progress
339
- prog = 0.1 + (0.4 * (i / total_sentences))
340
- progress(prog, desc=f"🌍 Translating sentence {i+1}/{total_sentences}...")
 
341
 
342
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
343
  if device == "cuda":
@@ -354,39 +458,72 @@ def process_document(file, progress=gr.Progress()):
354
 
355
  translated = " ".join(translated_sentences)
356
 
357
- # Generate audio in batches
358
- progress(0.5, desc="πŸŽ™οΈ Generating audio...")
359
- chunks = split_text(translated)
360
- total_chunks = len(chunks)
361
 
362
  tts_model, tts_tokenizer = get_tts_model()
363
  audio_segments = []
364
  timestamps = []
365
  current_time = 0.0
366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  with torch.no_grad():
368
- for i, chunk in enumerate(chunks):
369
  if not chunk.strip():
370
  continue
371
 
 
 
 
 
372
  # Update progress
373
- prog = 0.5 + (0.4 * (i / total_chunks))
374
- progress(prog, desc=f"πŸŽ™οΈ Generating audio {i+1}/{total_chunks}...")
375
 
376
  inputs = tts_tokenizer(chunk, return_tensors="pt")
377
  if device == "cuda":
378
  inputs = {k: v.cuda() for k, v in inputs.items()}
379
 
380
  audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
 
 
 
 
 
381
  audio_segments.append(audio)
382
 
 
 
 
383
  duration = len(audio) / SAMPLE_RATE
384
  timestamps.append({
385
  "start": format_time(current_time),
386
  "end": format_time(current_time + duration),
387
- "text": chunk
 
 
388
  })
389
- current_time += duration
390
 
391
  # Concatenate audio
392
  if not audio_segments:
@@ -394,19 +531,33 @@ def process_document(file, progress=gr.Progress()):
394
 
395
  full_audio = np.concatenate(audio_segments)
396
 
 
 
 
 
 
397
  # Save audio
398
  progress(0.95, desc="πŸ’Ύ Saving audiobook...")
399
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
400
  wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
401
  audio_path = f.name
402
 
403
- # Format output
404
- timestamps_text = "\n".join([f"[{t['start']} β†’ {t['end']}] {t['text']}" for t in timestamps])
 
 
 
405
 
406
  # Calculate audio duration
407
  audio_duration = len(full_audio) / SAMPLE_RATE
408
  duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
409
 
 
 
 
 
 
 
410
  transcript = f"""## Original (English)
411
  {text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
412
 
@@ -415,10 +566,12 @@ def process_document(file, progress=gr.Progress()):
415
 
416
  ---
417
  πŸ“Š **Stats**: {len(text):,} chars β†’ {len(translated):,} chars | 🎡 Duration: {duration_str}
 
 
418
  """
419
 
420
  progress(1.0, desc="βœ… Done!")
421
- return audio_path, transcript, timestamps_text, f"βœ… Audiobook generated! Duration: {duration_str}"
422
 
423
  except Exception as e:
424
  import traceback
@@ -437,6 +590,7 @@ with gr.Blocks(
437
  <div style="text-align: center; margin-bottom: 1rem;">
438
  <h1>🎧 PlotWeaver Audiobook Generator</h1>
439
  <p><strong>English β†’ Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
 
440
  </div>
441
  """)
442
 
@@ -447,34 +601,54 @@ with gr.Blocks(
447
  file_types=[".pdf", ".docx", ".doc", ".txt"],
448
  type="filepath"
449
  )
 
 
 
 
 
 
 
450
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
451
  status = gr.Textbox(label="Status", interactive=False)
452
 
453
  gr.Markdown("""
454
  ### How it works
455
  1. Upload English document (PDF, DOCX, DOC, TXT)
456
- 2. AI translates to Hausa
457
- 3. TTS generates natural audio
458
- 4. Download audiobook with timestamps
 
 
 
 
 
 
 
 
 
 
459
 
460
  ---
461
- ⏱️ **Processing time**: ~1-2 min per page
462
- πŸ“„ **Max length**: 10,000 characters (~4 pages)
463
  """)
464
 
465
  with gr.Column(scale=2):
466
- audio_out = gr.Audio(label="🎧 Hausa Audiobook")
467
  with gr.Tabs():
468
  with gr.Tab("πŸ“œ Transcript"):
469
  transcript = gr.Markdown()
470
- with gr.Tab("⏱️ Timestamps"):
471
- timestamps = gr.Textbox(lines=10, interactive=False)
472
 
473
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
474
- <strong>PlotWeaver</strong> - AI for African Languages
475
  </div>""")
476
 
477
- btn.click(process_document, [file_input], [audio_out, transcript, timestamps, status])
 
 
 
 
478
 
479
  # ============================================
480
  # LAUNCH
 
1
  """
2
  PlotWeaver Audiobook Generator
3
+ English β†’ Hausa Translation + TTS with Timestamps + Emotions
4
 
5
  Optimized for fast startup on HuggingFace Spaces.
6
  """
 
12
  import re
13
  from pathlib import Path
14
  from datetime import timedelta
15
+ from typing import List, Tuple, Dict
16
 
17
  # Document processing
18
  import fitz # PyMuPDF
19
  from docx import Document
20
 
21
  import scipy.io.wavfile as wavfile
22
+ from scipy import signal
23
 
24
  # ============================================
25
  # CONFIGURATION
 
31
  SAMPLE_RATE = 16000
32
  MAX_CHUNK_LENGTH = 200
33
 
34
# Per-emotion audio modifiers. "pitch" and "speed" are multiplicative
# factors applied via resampling, "energy" scales amplitude, and
# "emoji" decorates progress messages and timestamp listings.
EMOTION_SETTINGS = {
    "joy":      {"pitch": 1.15, "speed": 1.10, "energy": 1.2, "emoji": "😊"},
    "sadness":  {"pitch": 0.90, "speed": 0.85, "energy": 0.8, "emoji": "😒"},
    "anger":    {"pitch": 1.10, "speed": 1.15, "energy": 1.4, "emoji": "😠"},
    "fear":     {"pitch": 1.20, "speed": 1.20, "energy": 1.1, "emoji": "😨"},
    "surprise": {"pitch": 1.25, "speed": 1.05, "energy": 1.3, "emoji": "😲"},
    "neutral":  {"pitch": 1.00, "speed": 1.00, "energy": 1.0, "emoji": "😐"},
}

# Keyword stems for lexicon-based emotion detection. Partial stems
# (e.g. "frustrat", "irritat", "frighten") are intentional so that
# inflected forms ("frustrated", "irritating") still match.
EMOTION_KEYWORDS = {
    "joy": ["happy", "joy", "excited", "wonderful", "great", "love", "beautiful", "amazing", "fantastic", "delighted", "pleased", "glad", "cheerful", "celebrate", "laugh", "smile"],
    "sadness": ["sad", "sorry", "unfortunately", "loss", "grief", "tears", "cry", "mourn", "depressed", "heartbroken", "tragic", "miserable", "lonely", "pain", "suffer"],
    "anger": ["angry", "furious", "outraged", "hate", "frustrat", "annoyed", "mad", "rage", "hostile", "bitter", "resent", "irritat", "violent", "fight", "attack"],
    "fear": ["afraid", "fear", "scared", "terrified", "worried", "anxious", "panic", "horror", "dread", "nervous", "frighten", "danger", "threat", "alarm"],
    "surprise": ["surprised", "amazed", "astonished", "shocked", "unexpected", "wow", "incredible", "unbelievable", "sudden", "remarkable", "stunning"],
}
52
+
53
  # Global model cache (lazy loaded)
54
  _models = {}
55
 
 
217
 
218
  return _models["tts"]
219
 
220
# ============================================
# EMOTION DETECTION
# ============================================
def detect_emotion(text: str) -> str:
    """Classify *text* into one of the EMOTION_KEYWORDS categories.

    Scores each emotion by counting keyword stems found in the text,
    adds punctuation/casing cues, and returns the highest-scoring
    emotion, or "neutral" when nothing matched.

    Keywords are matched at a word start (``\\b`` anchor) so a stem like
    "mad" no longer fires inside unrelated words ("nomad"), while
    deliberate stems such as "frustrat" still match "frustrated".
    """
    text_lower = text.lower()

    emotion_scores = {emotion: 0 for emotion in EMOTION_KEYWORDS}

    for emotion, keywords in EMOTION_KEYWORDS.items():
        for keyword in keywords:
            # Anchor at a word boundary; suffixes stay allowed so the
            # prefix stems in EMOTION_KEYWORDS keep working.
            if re.search(r"\b" + re.escape(keyword), text_lower):
                emotion_scores[emotion] += 1

    # Punctuation-based cues
    if text.count('!') >= 2:
        emotion_scores["joy"] += 1
        emotion_scores["surprise"] += 1
    if text.count('?') >= 2:
        emotion_scores["surprise"] += 1
    # Sustained ALL-CAPS reads as shouting
    if text.isupper() and len(text) > 10:
        emotion_scores["anger"] += 1

    # Highest-scoring emotion wins; ties resolve in dict order.
    max_emotion = max(emotion_scores, key=emotion_scores.get)

    if emotion_scores[max_emotion] > 0:
        return max_emotion
    return "neutral"
249
+
250
# ============================================
# AUDIO EMOTION PROCESSING
# ============================================
def apply_emotion_to_audio(audio: np.ndarray, emotion: str, sample_rate: int = SAMPLE_RATE) -> np.ndarray:
    """Apply crude emotion effects (playback rate + volume) to a mono waveform.

    NOTE: plain resampling changes pitch and speed *together*, so the
    "pitch" and "speed" settings cannot act independently here — they
    compound into a single rate factor.  (Independent pitch shifting
    would require a phase-vocoder style effect.)  The original code
    performed two back-to-back resamples; this does the equivalent
    length change in one pass, halving the interpolation error/work.

    Returns the (possibly shorter/longer) processed waveform, peak-
    limited to 0.95 to prevent clipping.
    """
    # Nothing to do for neutral or empty input (np.max would raise on empty).
    if emotion == "neutral" or audio.size == 0:
        return audio

    settings = EMOTION_SETTINGS.get(emotion, EMOTION_SETTINGS["neutral"])

    # Combined rate change: >1 shortens the clip (faster / higher),
    # <1 lengthens it (slower / lower).
    rate = settings["pitch"] * settings["speed"]
    if rate != 1.0:
        audio = signal.resample(audio, int(len(audio) / rate))

    # Energy/volume adjustment
    audio = audio * settings["energy"]

    # Normalize to prevent clipping
    max_val = np.max(np.abs(audio))
    if max_val > 0.95:
        audio = audio * (0.95 / max_val)

    return audio
284
+
285
def add_pause(duration_ms: int = 300) -> np.ndarray:
    """Return a silent mono segment lasting *duration_ms* milliseconds.

    Sample count is derived from the module-wide SAMPLE_RATE and
    truncated to a whole number of samples.
    """
    sample_count = int(SAMPLE_RATE * duration_ms / 1000)
    return np.zeros(sample_count)
289
+
290
  # ============================================
291
  # TRANSLATION
292
  # ============================================
 
385
  # ============================================
386
  MAX_CHARS = 10000 # Max characters to process (increase for longer files)
387
 
388
+ def process_document(file, enable_emotions=True, progress=gr.Progress()):
389
+ """Main pipeline: Document β†’ Translation β†’ TTS with Emotions β†’ Audiobook"""
390
 
391
  if file is None:
392
  return None, "", "", "⚠️ Please upload a document"
 
410
 
411
  # Split into sentences for batch processing
412
  sentences = re.split(r'(?<=[.!?])\s+', text)
413
+ sentences = [s.strip() for s in sentences if s.strip()]
414
  total_sentences = len(sentences)
415
 
416
+ # Detect emotions for each sentence
417
+ progress(0.08, desc="🎭 Analyzing emotions...")
418
+ sentence_emotions = []
419
+ for sentence in sentences:
420
+ emotion = detect_emotion(sentence) if enable_emotions else "neutral"
421
+ sentence_emotions.append(emotion)
422
+
423
+ # Count emotions
424
+ emotion_counts = {}
425
+ for e in sentence_emotions:
426
+ emotion_counts[e] = emotion_counts.get(e, 0) + 1
427
+
428
  # Translate in batches
429
  progress(0.1, desc=f"🌍 Translating {total_sentences} sentences...")
430
  translated_sentences = []
 
439
  continue
440
 
441
  # Update progress
442
+ prog = 0.1 + (0.35 * (i / total_sentences))
443
+ emotion_emoji = EMOTION_SETTINGS[sentence_emotions[i]]["emoji"]
444
+ progress(prog, desc=f"🌍 Translating {i+1}/{total_sentences} {emotion_emoji}")
445
 
446
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
447
  if device == "cuda":
 
458
 
459
  translated = " ".join(translated_sentences)
460
 
461
+ # Generate audio with emotions
462
+ progress(0.45, desc="πŸŽ™οΈ Generating expressive audio...")
 
 
463
 
464
  tts_model, tts_tokenizer = get_tts_model()
465
  audio_segments = []
466
  timestamps = []
467
  current_time = 0.0
468
 
469
+ # Split translated text for TTS
470
+ hausa_chunks = split_text(translated)
471
+ total_chunks = len(hausa_chunks)
472
+
473
+ # Map chunks to emotions (approximate)
474
+ chunk_emotions = []
475
+ chunk_idx = 0
476
+ for i, emotion in enumerate(sentence_emotions):
477
+ # Estimate how many chunks per sentence
478
+ if i < len(sentences):
479
+ sentence_len = len(translated_sentences[i]) if i < len(translated_sentences) else 100
480
+ chunks_per_sentence = max(1, sentence_len // MAX_CHUNK_LENGTH + 1)
481
+ for _ in range(chunks_per_sentence):
482
+ if chunk_idx < total_chunks:
483
+ chunk_emotions.append(emotion)
484
+ chunk_idx += 1
485
+
486
+ # Fill remaining with neutral
487
+ while len(chunk_emotions) < total_chunks:
488
+ chunk_emotions.append("neutral")
489
+
490
  with torch.no_grad():
491
+ for i, chunk in enumerate(hausa_chunks):
492
  if not chunk.strip():
493
  continue
494
 
495
+ # Get emotion for this chunk
496
+ emotion = chunk_emotions[i] if i < len(chunk_emotions) else "neutral"
497
+ emotion_emoji = EMOTION_SETTINGS[emotion]["emoji"]
498
+
499
  # Update progress
500
+ prog = 0.45 + (0.45 * (i / total_chunks))
501
+ progress(prog, desc=f"πŸŽ™οΈ Generating audio {i+1}/{total_chunks} {emotion_emoji}")
502
 
503
  inputs = tts_tokenizer(chunk, return_tensors="pt")
504
  if device == "cuda":
505
  inputs = {k: v.cuda() for k, v in inputs.items()}
506
 
507
  audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
508
+
509
+ # Apply emotion effects
510
+ if enable_emotions and emotion != "neutral":
511
+ audio = apply_emotion_to_audio(audio, emotion)
512
+
513
  audio_segments.append(audio)
514
 
515
+ # Add small pause between chunks
516
+ audio_segments.append(add_pause(200))
517
+
518
  duration = len(audio) / SAMPLE_RATE
519
  timestamps.append({
520
  "start": format_time(current_time),
521
  "end": format_time(current_time + duration),
522
+ "text": chunk,
523
+ "emotion": emotion,
524
+ "emoji": emotion_emoji
525
  })
526
+ current_time += duration + 0.2 # Include pause
527
 
528
  # Concatenate audio
529
  if not audio_segments:
 
531
 
532
  full_audio = np.concatenate(audio_segments)
533
 
534
+ # Normalize final audio
535
+ max_val = np.max(np.abs(full_audio))
536
+ if max_val > 0:
537
+ full_audio = full_audio * (0.9 / max_val)
538
+
539
  # Save audio
540
  progress(0.95, desc="πŸ’Ύ Saving audiobook...")
541
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
542
  wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
543
  audio_path = f.name
544
 
545
+ # Format timestamps with emotions
546
+ timestamps_text = "\n".join([
547
+ f"[{t['start']} β†’ {t['end']}] {t['emoji']} [{t['emotion'].upper()}] {t['text']}"
548
+ for t in timestamps
549
+ ])
550
 
551
  # Calculate audio duration
552
  audio_duration = len(full_audio) / SAMPLE_RATE
553
  duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
554
 
555
+ # Emotion summary
556
+ emotion_summary = " | ".join([
557
+ f"{EMOTION_SETTINGS[e]['emoji']} {e}: {c}"
558
+ for e, c in sorted(emotion_counts.items(), key=lambda x: -x[1])
559
+ ])
560
+
561
  transcript = f"""## Original (English)
562
  {text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
563
 
 
566
 
567
  ---
568
  πŸ“Š **Stats**: {len(text):,} chars β†’ {len(translated):,} chars | 🎡 Duration: {duration_str}
569
+
570
+ 🎭 **Emotions detected**: {emotion_summary}
571
  """
572
 
573
  progress(1.0, desc="βœ… Done!")
574
+ return audio_path, transcript, timestamps_text, f"βœ… Audiobook generated! Duration: {duration_str} | 🎭 Emotions: {len([e for e in sentence_emotions if e != 'neutral'])} expressive segments"
575
 
576
  except Exception as e:
577
  import traceback
 
590
  <div style="text-align: center; margin-bottom: 1rem;">
591
  <h1>🎧 PlotWeaver Audiobook Generator</h1>
592
  <p><strong>English β†’ Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
593
+ <p style="color: #666;">✨ Now with Emotional Expression!</p>
594
  </div>
595
  """)
596
 
 
601
  file_types=[".pdf", ".docx", ".doc", ".txt"],
602
  type="filepath"
603
  )
604
+
605
+ emotion_toggle = gr.Checkbox(
606
+ label="🎭 Enable Emotional Expression",
607
+ value=True,
608
+ info="Adds emotion to voice based on text sentiment"
609
+ )
610
+
611
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
612
  status = gr.Textbox(label="Status", interactive=False)
613
 
614
  gr.Markdown("""
615
  ### How it works
616
  1. Upload English document (PDF, DOCX, DOC, TXT)
617
+ 2. AI **detects emotions** in text
618
+ 3. Translates to Hausa with NLLB-200
619
+ 4. TTS generates **expressive audio**
620
+ 5. Download audiobook with timestamps
621
+
622
+ ---
623
+ ### 🎭 Emotions Detected
624
+ - 😊 **Joy** - Higher pitch, faster pace
625
+ - 😒 **Sadness** - Lower pitch, slower pace
626
+ - 😠 **Anger** - Intense, louder
627
+ - 😨 **Fear** - Faster, higher pitch
628
+ - 😲 **Surprise** - Excited tone
629
+ - 😐 **Neutral** - Normal speech
630
 
631
  ---
632
+ ⏱️ **Processing**: ~1-2 min per page
 
633
  """)
634
 
635
  with gr.Column(scale=2):
636
+ audio_out = gr.Audio(label="🎧 Hausa Audiobook (with Emotions)")
637
  with gr.Tabs():
638
  with gr.Tab("πŸ“œ Transcript"):
639
  transcript = gr.Markdown()
640
+ with gr.Tab("⏱️ Timestamps + Emotions"):
641
+ timestamps = gr.Textbox(lines=12, interactive=False)
642
 
643
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
644
+ <strong>PlotWeaver</strong> - AI for African Languages | 🎭 Expressive Audiobooks
645
  </div>""")
646
 
647
+ btn.click(
648
+ process_document,
649
+ [file_input, emotion_toggle],
650
+ [audio_out, transcript, timestamps, status]
651
+ )
652
 
653
  # ============================================
654
  # LAUNCH