Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
PlotWeaver Audiobook Generator
|
| 3 |
-
English β Hausa Translation + TTS with Timestamps
|
| 4 |
|
| 5 |
Optimized for fast startup on HuggingFace Spaces.
|
| 6 |
"""
|
|
@@ -12,13 +12,14 @@ import tempfile
|
|
| 12 |
import re
|
| 13 |
from pathlib import Path
|
| 14 |
from datetime import timedelta
|
| 15 |
-
from typing import List, Tuple
|
| 16 |
|
| 17 |
# Document processing
|
| 18 |
import fitz # PyMuPDF
|
| 19 |
from docx import Document
|
| 20 |
|
| 21 |
import scipy.io.wavfile as wavfile
|
|
|
|
| 22 |
|
| 23 |
# ============================================
|
| 24 |
# CONFIGURATION
|
|
@@ -30,6 +31,25 @@ TGT_LANG = "hau_Latn"
|
|
| 30 |
SAMPLE_RATE = 16000
|
| 31 |
MAX_CHUNK_LENGTH = 200
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Global model cache (lazy loaded)
|
| 34 |
_models = {}
|
| 35 |
|
|
@@ -197,6 +217,76 @@ def get_tts_model():
|
|
| 197 |
|
| 198 |
return _models["tts"]
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
# ============================================
|
| 201 |
# TRANSLATION
|
| 202 |
# ============================================
|
|
@@ -295,8 +385,8 @@ def format_time(seconds: float) -> str:
|
|
| 295 |
# ============================================
|
| 296 |
MAX_CHARS = 10000 # Max characters to process (increase for longer files)
|
| 297 |
|
| 298 |
-
def process_document(file, progress=gr.Progress()):
|
| 299 |
-
"""Main pipeline: Document β Translation β TTS β Audiobook"""
|
| 300 |
|
| 301 |
if file is None:
|
| 302 |
return None, "", "", "β οΈ Please upload a document"
|
|
@@ -320,8 +410,21 @@ def process_document(file, progress=gr.Progress()):
|
|
| 320 |
|
| 321 |
# Split into sentences for batch processing
|
| 322 |
sentences = re.split(r'(?<=[.!?])\s+', text)
|
|
|
|
| 323 |
total_sentences = len(sentences)
|
| 324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
# Translate in batches
|
| 326 |
progress(0.1, desc=f"π Translating {total_sentences} sentences...")
|
| 327 |
translated_sentences = []
|
|
@@ -336,8 +439,9 @@ def process_document(file, progress=gr.Progress()):
|
|
| 336 |
continue
|
| 337 |
|
| 338 |
# Update progress
|
| 339 |
-
prog = 0.1 + (0.
|
| 340 |
-
|
|
|
|
| 341 |
|
| 342 |
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
|
| 343 |
if device == "cuda":
|
|
@@ -354,39 +458,72 @@ def process_document(file, progress=gr.Progress()):
|
|
| 354 |
|
| 355 |
translated = " ".join(translated_sentences)
|
| 356 |
|
| 357 |
-
# Generate audio
|
| 358 |
-
progress(0.
|
| 359 |
-
chunks = split_text(translated)
|
| 360 |
-
total_chunks = len(chunks)
|
| 361 |
|
| 362 |
tts_model, tts_tokenizer = get_tts_model()
|
| 363 |
audio_segments = []
|
| 364 |
timestamps = []
|
| 365 |
current_time = 0.0
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
with torch.no_grad():
|
| 368 |
-
for i, chunk in enumerate(
|
| 369 |
if not chunk.strip():
|
| 370 |
continue
|
| 371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
# Update progress
|
| 373 |
-
prog = 0.
|
| 374 |
-
progress(prog, desc=f"ποΈ Generating audio {i+1}/{total_chunks}
|
| 375 |
|
| 376 |
inputs = tts_tokenizer(chunk, return_tensors="pt")
|
| 377 |
if device == "cuda":
|
| 378 |
inputs = {k: v.cuda() for k, v in inputs.items()}
|
| 379 |
|
| 380 |
audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
audio_segments.append(audio)
|
| 382 |
|
|
|
|
|
|
|
|
|
|
| 383 |
duration = len(audio) / SAMPLE_RATE
|
| 384 |
timestamps.append({
|
| 385 |
"start": format_time(current_time),
|
| 386 |
"end": format_time(current_time + duration),
|
| 387 |
-
"text": chunk
|
|
|
|
|
|
|
| 388 |
})
|
| 389 |
-
current_time += duration
|
| 390 |
|
| 391 |
# Concatenate audio
|
| 392 |
if not audio_segments:
|
|
@@ -394,19 +531,33 @@ def process_document(file, progress=gr.Progress()):
|
|
| 394 |
|
| 395 |
full_audio = np.concatenate(audio_segments)
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
# Save audio
|
| 398 |
progress(0.95, desc="πΎ Saving audiobook...")
|
| 399 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 400 |
wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
|
| 401 |
audio_path = f.name
|
| 402 |
|
| 403 |
-
# Format
|
| 404 |
-
timestamps_text = "\n".join([
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
# Calculate audio duration
|
| 407 |
audio_duration = len(full_audio) / SAMPLE_RATE
|
| 408 |
duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
transcript = f"""## Original (English)
|
| 411 |
{text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
|
| 412 |
|
|
@@ -415,10 +566,12 @@ def process_document(file, progress=gr.Progress()):
|
|
| 415 |
|
| 416 |
---
|
| 417 |
π **Stats**: {len(text):,} chars β {len(translated):,} chars | π΅ Duration: {duration_str}
|
|
|
|
|
|
|
| 418 |
"""
|
| 419 |
|
| 420 |
progress(1.0, desc="β
Done!")
|
| 421 |
-
return audio_path, transcript, timestamps_text, f"β
Audiobook generated! Duration: {duration_str}"
|
| 422 |
|
| 423 |
except Exception as e:
|
| 424 |
import traceback
|
|
@@ -437,6 +590,7 @@ with gr.Blocks(
|
|
| 437 |
<div style="text-align: center; margin-bottom: 1rem;">
|
| 438 |
<h1>π§ PlotWeaver Audiobook Generator</h1>
|
| 439 |
<p><strong>English β Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
|
|
|
|
| 440 |
</div>
|
| 441 |
""")
|
| 442 |
|
|
@@ -447,34 +601,54 @@ with gr.Blocks(
|
|
| 447 |
file_types=[".pdf", ".docx", ".doc", ".txt"],
|
| 448 |
type="filepath"
|
| 449 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
btn = gr.Button("π Generate Audiobook", variant="primary", size="lg")
|
| 451 |
status = gr.Textbox(label="Status", interactive=False)
|
| 452 |
|
| 453 |
gr.Markdown("""
|
| 454 |
### How it works
|
| 455 |
1. Upload English document (PDF, DOCX, DOC, TXT)
|
| 456 |
-
2. AI
|
| 457 |
-
3.
|
| 458 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
---
|
| 461 |
-
β±οΈ **Processing
|
| 462 |
-
π **Max length**: 10,000 characters (~4 pages)
|
| 463 |
""")
|
| 464 |
|
| 465 |
with gr.Column(scale=2):
|
| 466 |
-
audio_out = gr.Audio(label="π§ Hausa Audiobook")
|
| 467 |
with gr.Tabs():
|
| 468 |
with gr.Tab("π Transcript"):
|
| 469 |
transcript = gr.Markdown()
|
| 470 |
-
with gr.Tab("β±οΈ Timestamps"):
|
| 471 |
-
timestamps = gr.Textbox(lines=
|
| 472 |
|
| 473 |
gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
|
| 474 |
-
<strong>PlotWeaver</strong> - AI for African Languages
|
| 475 |
</div>""")
|
| 476 |
|
| 477 |
-
btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
|
| 479 |
# ============================================
|
| 480 |
# LAUNCH
|
|
|
|
| 1 |
"""
|
| 2 |
PlotWeaver Audiobook Generator
|
| 3 |
+
English → Hausa Translation + TTS with Timestamps + Emotions
|
| 4 |
|
| 5 |
Optimized for fast startup on HuggingFace Spaces.
|
| 6 |
"""
|
|
|
|
| 12 |
import re
|
| 13 |
from pathlib import Path
|
| 14 |
from datetime import timedelta
|
| 15 |
+
from typing import List, Tuple, Dict
|
| 16 |
|
| 17 |
# Document processing
|
| 18 |
import fitz # PyMuPDF
|
| 19 |
from docx import Document
|
| 20 |
|
| 21 |
import scipy.io.wavfile as wavfile
|
| 22 |
+
from scipy import signal
|
| 23 |
|
| 24 |
# ============================================
|
| 25 |
# CONFIGURATION
|
|
|
|
| 31 |
SAMPLE_RATE = 16000
|
| 32 |
MAX_CHUNK_LENGTH = 200
|
| 33 |
|
| 34 |
+
# Emotion settings (pitch_shift, speed_factor, energy_boost)
# Per-emotion voice-modulation multipliers applied to the raw TTS waveform
# (see apply_emotion_to_audio):
#   "pitch"  - resampling factor (>1.0 shortens the signal / raises pitch)
#   "speed"  - additional resampling factor (>1.0 speeds playback up)
#   "energy" - amplitude gain applied to the samples
# "emoji" is a display glyph used only in progress messages and timestamps.
# NOTE(review): the emoji literals appear mojibake-garbled in this view —
# confirm the file's original UTF-8 encoding before changing them.
EMOTION_SETTINGS = {
    "joy": {"pitch": 1.15, "speed": 1.10, "energy": 1.2, "emoji": "π"},
    "sadness": {"pitch": 0.90, "speed": 0.85, "energy": 0.8, "emoji": "π’"},
    "anger": {"pitch": 1.10, "speed": 1.15, "energy": 1.4, "emoji": "π "},
    "fear": {"pitch": 1.20, "speed": 1.20, "energy": 1.1, "emoji": "π¨"},
    "surprise": {"pitch": 1.25, "speed": 1.05, "energy": 1.3, "emoji": "π²"},
    "neutral": {"pitch": 1.00, "speed": 1.00, "energy": 1.0, "emoji": "π"},
}

# Emotion keywords for detection.
# Matched as case-insensitive substrings by detect_emotion, so partial stems
# such as "frustrat" intentionally cover "frustrated"/"frustrating".
# Dict insertion order matters: it is the tie-break order when two emotions
# score equally (max() returns the first maximal key).
EMOTION_KEYWORDS = {
    "joy": ["happy", "joy", "excited", "wonderful", "great", "love", "beautiful", "amazing", "fantastic", "delighted", "pleased", "glad", "cheerful", "celebrate", "laugh", "smile"],
    "sadness": ["sad", "sorry", "unfortunately", "loss", "grief", "tears", "cry", "mourn", "depressed", "heartbroken", "tragic", "miserable", "lonely", "pain", "suffer"],
    "anger": ["angry", "furious", "outraged", "hate", "frustrat", "annoyed", "mad", "rage", "hostile", "bitter", "resent", "irritat", "violent", "fight", "attack"],
    "fear": ["afraid", "fear", "scared", "terrified", "worried", "anxious", "panic", "horror", "dread", "nervous", "frighten", "danger", "threat", "alarm"],
    "surprise": ["surprised", "amazed", "astonished", "shocked", "unexpected", "wow", "incredible", "unbelievable", "sudden", "remarkable", "stunning"],
}
|
| 52 |
+
|
| 53 |
# Global model cache (lazy loaded)
|
| 54 |
_models = {}
|
| 55 |
|
|
|
|
| 217 |
|
| 218 |
return _models["tts"]
|
| 219 |
|
| 220 |
+
# ============================================
|
| 221 |
+
# EMOTION DETECTION
|
| 222 |
+
# ============================================
|
| 223 |
+
def detect_emotion(text: str) -> str:
    """Classify the dominant emotion of an English text snippet.

    Scores every emotion in EMOTION_KEYWORDS by counting case-insensitive
    substring hits, adds punctuation/casing cues, and returns the
    highest-scoring emotion — or "neutral" when nothing matched at all.
    Ties resolve to the emotion listed first in EMOTION_KEYWORDS.
    """
    lowered = text.lower()

    # One point per keyword found (substring match, so stems also count).
    scores = {
        emotion: sum(1 for kw in kws if kw in lowered)
        for emotion, kws in EMOTION_KEYWORDS.items()
    }

    # Punctuation / casing heuristics layered on top of keyword hits.
    if text.count('!') >= 2:
        scores["joy"] += 1
        scores["surprise"] += 1
    if text.count('?') >= 2:
        scores["surprise"] += 1
    if len(text) > 10 and text.isupper():
        # Long all-caps text reads as shouting.
        scores["anger"] += 1

    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "neutral"
|
| 249 |
+
|
| 250 |
+
# ============================================
|
| 251 |
+
# AUDIO EMOTION PROCESSING
|
| 252 |
+
# ============================================
|
| 253 |
+
def apply_emotion_to_audio(audio: np.ndarray, emotion: str, sample_rate: int = SAMPLE_RATE) -> np.ndarray:
    """Colour a TTS waveform with the given emotion.

    Looks up pitch/speed/energy multipliers in EMOTION_SETTINGS and applies
    them via resampling plus an amplitude gain, then rescales to avoid
    clipping. Returns the input untouched for "neutral"; unknown emotion
    names fall back to the neutral (identity) settings.

    NOTE(review): audio resampled and then played back at the original rate
    changes pitch and duration together, so the two resampling passes
    compound into one overall rate change rather than independent pitch and
    speed effects — confirm this matches the intended sound. `sample_rate`
    is currently unused because all length changes are relative.
    """
    # Fast path: no modification for neutral speech.
    if emotion == "neutral":
        return audio

    params = EMOTION_SETTINGS.get(emotion, EMOTION_SETTINGS["neutral"])

    # Resample twice: once for the "pitch" factor, once for the "speed"
    # factor; each pass shrinks/stretches the signal length by its factor.
    for factor in (params["pitch"], params["speed"]):
        if factor != 1.0:
            audio = signal.resample(audio, int(len(audio) / factor))

    # Amplitude gain.
    audio = audio * params["energy"]

    # Soft limiter: rescale whenever any sample exceeds 0.95.
    peak = np.max(np.abs(audio))
    if peak > 0.95:
        audio = audio * (0.95 / peak)

    return audio
|
| 284 |
+
|
| 285 |
+
def add_pause(duration_ms: int = 300, sample_rate: int = None) -> np.ndarray:
    """Generate silence for pauses between sentences.

    Args:
        duration_ms: Pause length in milliseconds (default 300 ms).
        sample_rate: Samples per second for the silence buffer. Defaults to
            the module-level SAMPLE_RATE when None, preserving the original
            behavior; passing it explicitly decouples the helper from the
            global and makes it reusable at other rates.

    Returns:
        A 1-D float64 array of zeros with ``sample_rate * duration_ms / 1000``
        samples (truncated to an int).
    """
    rate = SAMPLE_RATE if sample_rate is None else sample_rate
    num_samples = int(rate * duration_ms / 1000)
    return np.zeros(num_samples)
|
| 289 |
+
|
| 290 |
# ============================================
|
| 291 |
# TRANSLATION
|
| 292 |
# ============================================
|
|
|
|
| 385 |
# ============================================
|
| 386 |
MAX_CHARS = 10000 # Max characters to process (increase for longer files)
|
| 387 |
|
| 388 |
+
def process_document(file, enable_emotions=True, progress=gr.Progress()):
|
| 389 |
+
"""Main pipeline: Document β Translation β TTS with Emotions β Audiobook"""
|
| 390 |
|
| 391 |
if file is None:
|
| 392 |
return None, "", "", "β οΈ Please upload a document"
|
|
|
|
| 410 |
|
| 411 |
# Split into sentences for batch processing
|
| 412 |
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 413 |
+
sentences = [s.strip() for s in sentences if s.strip()]
|
| 414 |
total_sentences = len(sentences)
|
| 415 |
|
| 416 |
+
# Detect emotions for each sentence
|
| 417 |
+
progress(0.08, desc="π Analyzing emotions...")
|
| 418 |
+
sentence_emotions = []
|
| 419 |
+
for sentence in sentences:
|
| 420 |
+
emotion = detect_emotion(sentence) if enable_emotions else "neutral"
|
| 421 |
+
sentence_emotions.append(emotion)
|
| 422 |
+
|
| 423 |
+
# Count emotions
|
| 424 |
+
emotion_counts = {}
|
| 425 |
+
for e in sentence_emotions:
|
| 426 |
+
emotion_counts[e] = emotion_counts.get(e, 0) + 1
|
| 427 |
+
|
| 428 |
# Translate in batches
|
| 429 |
progress(0.1, desc=f"π Translating {total_sentences} sentences...")
|
| 430 |
translated_sentences = []
|
|
|
|
| 439 |
continue
|
| 440 |
|
| 441 |
# Update progress
|
| 442 |
+
prog = 0.1 + (0.35 * (i / total_sentences))
|
| 443 |
+
emotion_emoji = EMOTION_SETTINGS[sentence_emotions[i]]["emoji"]
|
| 444 |
+
progress(prog, desc=f"π Translating {i+1}/{total_sentences} {emotion_emoji}")
|
| 445 |
|
| 446 |
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
|
| 447 |
if device == "cuda":
|
|
|
|
| 458 |
|
| 459 |
translated = " ".join(translated_sentences)
|
| 460 |
|
| 461 |
+
# Generate audio with emotions
|
| 462 |
+
progress(0.45, desc="ποΈ Generating expressive audio...")
|
|
|
|
|
|
|
| 463 |
|
| 464 |
tts_model, tts_tokenizer = get_tts_model()
|
| 465 |
audio_segments = []
|
| 466 |
timestamps = []
|
| 467 |
current_time = 0.0
|
| 468 |
|
| 469 |
+
# Split translated text for TTS
|
| 470 |
+
hausa_chunks = split_text(translated)
|
| 471 |
+
total_chunks = len(hausa_chunks)
|
| 472 |
+
|
| 473 |
+
# Map chunks to emotions (approximate)
|
| 474 |
+
chunk_emotions = []
|
| 475 |
+
chunk_idx = 0
|
| 476 |
+
for i, emotion in enumerate(sentence_emotions):
|
| 477 |
+
# Estimate how many chunks per sentence
|
| 478 |
+
if i < len(sentences):
|
| 479 |
+
sentence_len = len(translated_sentences[i]) if i < len(translated_sentences) else 100
|
| 480 |
+
chunks_per_sentence = max(1, sentence_len // MAX_CHUNK_LENGTH + 1)
|
| 481 |
+
for _ in range(chunks_per_sentence):
|
| 482 |
+
if chunk_idx < total_chunks:
|
| 483 |
+
chunk_emotions.append(emotion)
|
| 484 |
+
chunk_idx += 1
|
| 485 |
+
|
| 486 |
+
# Fill remaining with neutral
|
| 487 |
+
while len(chunk_emotions) < total_chunks:
|
| 488 |
+
chunk_emotions.append("neutral")
|
| 489 |
+
|
| 490 |
with torch.no_grad():
|
| 491 |
+
for i, chunk in enumerate(hausa_chunks):
|
| 492 |
if not chunk.strip():
|
| 493 |
continue
|
| 494 |
|
| 495 |
+
# Get emotion for this chunk
|
| 496 |
+
emotion = chunk_emotions[i] if i < len(chunk_emotions) else "neutral"
|
| 497 |
+
emotion_emoji = EMOTION_SETTINGS[emotion]["emoji"]
|
| 498 |
+
|
| 499 |
# Update progress
|
| 500 |
+
prog = 0.45 + (0.45 * (i / total_chunks))
|
| 501 |
+
progress(prog, desc=f"ποΈ Generating audio {i+1}/{total_chunks} {emotion_emoji}")
|
| 502 |
|
| 503 |
inputs = tts_tokenizer(chunk, return_tensors="pt")
|
| 504 |
if device == "cuda":
|
| 505 |
inputs = {k: v.cuda() for k, v in inputs.items()}
|
| 506 |
|
| 507 |
audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
|
| 508 |
+
|
| 509 |
+
# Apply emotion effects
|
| 510 |
+
if enable_emotions and emotion != "neutral":
|
| 511 |
+
audio = apply_emotion_to_audio(audio, emotion)
|
| 512 |
+
|
| 513 |
audio_segments.append(audio)
|
| 514 |
|
| 515 |
+
# Add small pause between chunks
|
| 516 |
+
audio_segments.append(add_pause(200))
|
| 517 |
+
|
| 518 |
duration = len(audio) / SAMPLE_RATE
|
| 519 |
timestamps.append({
|
| 520 |
"start": format_time(current_time),
|
| 521 |
"end": format_time(current_time + duration),
|
| 522 |
+
"text": chunk,
|
| 523 |
+
"emotion": emotion,
|
| 524 |
+
"emoji": emotion_emoji
|
| 525 |
})
|
| 526 |
+
current_time += duration + 0.2 # Include pause
|
| 527 |
|
| 528 |
# Concatenate audio
|
| 529 |
if not audio_segments:
|
|
|
|
| 531 |
|
| 532 |
full_audio = np.concatenate(audio_segments)
|
| 533 |
|
| 534 |
+
# Normalize final audio
|
| 535 |
+
max_val = np.max(np.abs(full_audio))
|
| 536 |
+
if max_val > 0:
|
| 537 |
+
full_audio = full_audio * (0.9 / max_val)
|
| 538 |
+
|
| 539 |
# Save audio
|
| 540 |
progress(0.95, desc="πΎ Saving audiobook...")
|
| 541 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 542 |
wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
|
| 543 |
audio_path = f.name
|
| 544 |
|
| 545 |
+
# Format timestamps with emotions
|
| 546 |
+
timestamps_text = "\n".join([
|
| 547 |
+
f"[{t['start']} β {t['end']}] {t['emoji']} [{t['emotion'].upper()}] {t['text']}"
|
| 548 |
+
for t in timestamps
|
| 549 |
+
])
|
| 550 |
|
| 551 |
# Calculate audio duration
|
| 552 |
audio_duration = len(full_audio) / SAMPLE_RATE
|
| 553 |
duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
|
| 554 |
|
| 555 |
+
# Emotion summary
|
| 556 |
+
emotion_summary = " | ".join([
|
| 557 |
+
f"{EMOTION_SETTINGS[e]['emoji']} {e}: {c}"
|
| 558 |
+
for e, c in sorted(emotion_counts.items(), key=lambda x: -x[1])
|
| 559 |
+
])
|
| 560 |
+
|
| 561 |
transcript = f"""## Original (English)
|
| 562 |
{text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
|
| 563 |
|
|
|
|
| 566 |
|
| 567 |
---
|
| 568 |
π **Stats**: {len(text):,} chars β {len(translated):,} chars | π΅ Duration: {duration_str}
|
| 569 |
+
|
| 570 |
+
π **Emotions detected**: {emotion_summary}
|
| 571 |
"""
|
| 572 |
|
| 573 |
progress(1.0, desc="β
Done!")
|
| 574 |
+
return audio_path, transcript, timestamps_text, f"β
Audiobook generated! Duration: {duration_str} | π Emotions: {len([e for e in sentence_emotions if e != 'neutral'])} expressive segments"
|
| 575 |
|
| 576 |
except Exception as e:
|
| 577 |
import traceback
|
|
|
|
| 590 |
<div style="text-align: center; margin-bottom: 1rem;">
|
| 591 |
<h1>π§ PlotWeaver Audiobook Generator</h1>
|
| 592 |
<p><strong>English β Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
|
| 593 |
+
<p style="color: #666;">β¨ Now with Emotional Expression!</p>
|
| 594 |
</div>
|
| 595 |
""")
|
| 596 |
|
|
|
|
| 601 |
file_types=[".pdf", ".docx", ".doc", ".txt"],
|
| 602 |
type="filepath"
|
| 603 |
)
|
| 604 |
+
|
| 605 |
+
emotion_toggle = gr.Checkbox(
|
| 606 |
+
label="π Enable Emotional Expression",
|
| 607 |
+
value=True,
|
| 608 |
+
info="Adds emotion to voice based on text sentiment"
|
| 609 |
+
)
|
| 610 |
+
|
| 611 |
btn = gr.Button("π Generate Audiobook", variant="primary", size="lg")
|
| 612 |
status = gr.Textbox(label="Status", interactive=False)
|
| 613 |
|
| 614 |
gr.Markdown("""
|
| 615 |
### How it works
|
| 616 |
1. Upload English document (PDF, DOCX, DOC, TXT)
|
| 617 |
+
2. AI **detects emotions** in text
|
| 618 |
+
3. Translates to Hausa with NLLB-200
|
| 619 |
+
4. TTS generates **expressive audio**
|
| 620 |
+
5. Download audiobook with timestamps
|
| 621 |
+
|
| 622 |
+
---
|
| 623 |
+
### π Emotions Detected
|
| 624 |
+
- π **Joy** - Higher pitch, faster pace
|
| 625 |
+
- π’ **Sadness** - Lower pitch, slower pace
|
| 626 |
+
- π **Anger** - Intense, louder
|
| 627 |
+
- π¨ **Fear** - Faster, higher pitch
|
| 628 |
+
- π² **Surprise** - Excited tone
|
| 629 |
+
- π **Neutral** - Normal speech
|
| 630 |
|
| 631 |
---
|
| 632 |
+
β±οΈ **Processing**: ~1-2 min per page
|
|
|
|
| 633 |
""")
|
| 634 |
|
| 635 |
with gr.Column(scale=2):
|
| 636 |
+
audio_out = gr.Audio(label="π§ Hausa Audiobook (with Emotions)")
|
| 637 |
with gr.Tabs():
|
| 638 |
with gr.Tab("π Transcript"):
|
| 639 |
transcript = gr.Markdown()
|
| 640 |
+
with gr.Tab("β±οΈ Timestamps + Emotions"):
|
| 641 |
+
timestamps = gr.Textbox(lines=12, interactive=False)
|
| 642 |
|
| 643 |
gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
|
| 644 |
+
<strong>PlotWeaver</strong> - AI for African Languages | π Expressive Audiobooks
|
| 645 |
</div>""")
|
| 646 |
|
| 647 |
+
btn.click(
|
| 648 |
+
process_document,
|
| 649 |
+
[file_input, emotion_toggle],
|
| 650 |
+
[audio_out, transcript, timestamps, status]
|
| 651 |
+
)
|
| 652 |
|
| 653 |
# ============================================
|
| 654 |
# LAUNCH
|