Toadoum committed on
Commit
7eea943
Β·
verified Β·
1 Parent(s): 3c15094

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -28
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  PlotWeaver Audiobook Generator
3
- English β†’ Hausa Translation + TTS with Timestamps
4
 
5
  Optimized for fast startup on HuggingFace Spaces.
6
  """
@@ -12,13 +12,14 @@ import tempfile
12
  import re
13
  from pathlib import Path
14
  from datetime import timedelta
15
- from typing import List, Tuple
16
 
17
  # Document processing
18
  import fitz # PyMuPDF
19
  from docx import Document
20
 
21
  import scipy.io.wavfile as wavfile
 
22
 
23
  # ============================================
24
  # CONFIGURATION
@@ -30,6 +31,25 @@ TGT_LANG = "hau_Latn"
30
  SAMPLE_RATE = 16000
31
  MAX_CHUNK_LENGTH = 200
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Global model cache (lazy loaded)
34
  _models = {}
35
 
@@ -197,6 +217,76 @@ def get_tts_model():
197
 
198
  return _models["tts"]
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  # ============================================
201
  # TRANSLATION
202
  # ============================================
@@ -295,8 +385,8 @@ def format_time(seconds: float) -> str:
295
  # ============================================
296
  MAX_CHARS = 10000 # Max characters to process (increase for longer files)
297
 
298
- def process_document(file, progress=gr.Progress()):
299
- """Main pipeline: Document β†’ Translation β†’ TTS β†’ Audiobook"""
300
 
301
  if file is None:
302
  return None, "", "", "⚠️ Please upload a document"
@@ -320,8 +410,21 @@ def process_document(file, progress=gr.Progress()):
320
 
321
  # Split into sentences for batch processing
322
  sentences = re.split(r'(?<=[.!?])\s+', text)
 
323
  total_sentences = len(sentences)
324
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  # Translate in batches
326
  progress(0.1, desc=f"🌍 Translating {total_sentences} sentences...")
327
  translated_sentences = []
@@ -336,8 +439,9 @@ def process_document(file, progress=gr.Progress()):
336
  continue
337
 
338
  # Update progress
339
- prog = 0.1 + (0.4 * (i / total_sentences))
340
- progress(prog, desc=f"🌍 Translating sentence {i+1}/{total_sentences}...")
 
341
 
342
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
343
  if device == "cuda":
@@ -354,39 +458,72 @@ def process_document(file, progress=gr.Progress()):
354
 
355
  translated = " ".join(translated_sentences)
356
 
357
- # Generate audio in batches
358
- progress(0.5, desc="πŸŽ™οΈ Generating audio...")
359
- chunks = split_text(translated)
360
- total_chunks = len(chunks)
361
 
362
  tts_model, tts_tokenizer = get_tts_model()
363
  audio_segments = []
364
  timestamps = []
365
  current_time = 0.0
366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  with torch.no_grad():
368
- for i, chunk in enumerate(chunks):
369
  if not chunk.strip():
370
  continue
371
 
 
 
 
 
372
  # Update progress
373
- prog = 0.5 + (0.4 * (i / total_chunks))
374
- progress(prog, desc=f"πŸŽ™οΈ Generating audio {i+1}/{total_chunks}...")
375
 
376
  inputs = tts_tokenizer(chunk, return_tensors="pt")
377
  if device == "cuda":
378
  inputs = {k: v.cuda() for k, v in inputs.items()}
379
 
380
  audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
 
 
 
 
 
381
  audio_segments.append(audio)
382
 
 
 
 
383
  duration = len(audio) / SAMPLE_RATE
384
  timestamps.append({
385
  "start": format_time(current_time),
386
  "end": format_time(current_time + duration),
387
- "text": chunk
 
 
388
  })
389
- current_time += duration
390
 
391
  # Concatenate audio
392
  if not audio_segments:
@@ -394,19 +531,33 @@ def process_document(file, progress=gr.Progress()):
394
 
395
  full_audio = np.concatenate(audio_segments)
396
 
 
 
 
 
 
397
  # Save audio
398
  progress(0.95, desc="πŸ’Ύ Saving audiobook...")
399
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
400
  wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
401
  audio_path = f.name
402
 
403
- # Format output
404
- timestamps_text = "\n".join([f"[{t['start']} β†’ {t['end']}] {t['text']}" for t in timestamps])
 
 
 
405
 
406
  # Calculate audio duration
407
  audio_duration = len(full_audio) / SAMPLE_RATE
408
  duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
409
 
 
 
 
 
 
 
410
  transcript = f"""## Original (English)
411
  {text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
412
 
@@ -415,10 +566,12 @@ def process_document(file, progress=gr.Progress()):
415
 
416
  ---
417
  πŸ“Š **Stats**: {len(text):,} chars β†’ {len(translated):,} chars | 🎡 Duration: {duration_str}
 
 
418
  """
419
 
420
  progress(1.0, desc="βœ… Done!")
421
- return audio_path, transcript, timestamps_text, f"βœ… Audiobook generated! Duration: {duration_str}"
422
 
423
  except Exception as e:
424
  import traceback
@@ -437,6 +590,7 @@ with gr.Blocks(
437
  <div style="text-align: center; margin-bottom: 1rem;">
438
  <h1>🎧 PlotWeaver Audiobook Generator</h1>
439
  <p><strong>English β†’ Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
 
440
  </div>
441
  """)
442
 
@@ -447,34 +601,54 @@ with gr.Blocks(
447
  file_types=[".pdf", ".docx", ".doc", ".txt"],
448
  type="filepath"
449
  )
 
 
 
 
 
 
 
450
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
451
  status = gr.Textbox(label="Status", interactive=False)
452
 
453
  gr.Markdown("""
454
  ### How it works
455
  1. Upload English document (PDF, DOCX, DOC, TXT)
456
- 2. AI translates to Hausa
457
- 3. TTS generates natural audio
458
- 4. Download audiobook with timestamps
 
 
 
 
 
 
 
 
 
 
459
 
460
  ---
461
- ⏱️ **Processing time**: ~1-2 min per page
462
- πŸ“„ **Max length**: 10,000 characters (~4 pages)
463
  """)
464
 
465
  with gr.Column(scale=2):
466
- audio_out = gr.Audio(label="🎧 Hausa Audiobook")
467
  with gr.Tabs():
468
  with gr.Tab("πŸ“œ Transcript"):
469
  transcript = gr.Markdown()
470
- with gr.Tab("⏱️ Timestamps"):
471
- timestamps = gr.Textbox(lines=10, interactive=False)
472
 
473
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
474
- <strong>PlotWeaver</strong> - AI for African Languages
475
  </div>""")
476
 
477
- btn.click(process_document, [file_input], [audio_out, transcript, timestamps, status])
 
 
 
 
478
 
479
  # ============================================
480
  # LAUNCH
 
1
  """
2
  PlotWeaver Audiobook Generator
3
+ English β†’ Hausa Translation + TTS with Timestamps + Emotions
4
 
5
  Optimized for fast startup on HuggingFace Spaces.
6
  """
 
12
  import re
13
  from pathlib import Path
14
  from datetime import timedelta
15
+ from typing import List, Tuple, Dict
16
 
17
  # Document processing
18
  import fitz # PyMuPDF
19
  from docx import Document
20
 
21
  import scipy.io.wavfile as wavfile
22
+ from scipy import signal
23
 
24
  # ============================================
25
  # CONFIGURATION
 
31
  SAMPLE_RATE = 16000
32
  MAX_CHUNK_LENGTH = 200
33
 
34
# Per-emotion audio modifiers. "pitch" and "speed" are multiplicative
# factors applied via resampling, "energy" scales amplitude, and
# "emoji" decorates progress messages and timestamp listings.
EMOTION_SETTINGS = {
    "joy":      {"pitch": 1.15, "speed": 1.10, "energy": 1.2, "emoji": "😊"},
    "sadness":  {"pitch": 0.90, "speed": 0.85, "energy": 0.8, "emoji": "😒"},
    "anger":    {"pitch": 1.10, "speed": 1.15, "energy": 1.4, "emoji": "😠"},
    "fear":     {"pitch": 1.20, "speed": 1.20, "energy": 1.1, "emoji": "😨"},
    "surprise": {"pitch": 1.25, "speed": 1.05, "energy": 1.3, "emoji": "😲"},
    "neutral":  {"pitch": 1.00, "speed": 1.00, "energy": 1.0, "emoji": "😐"},
}

# Keyword stems for lexicon-based emotion detection. Partial stems
# (e.g. "frustrat", "irritat", "frighten") are intentional so that
# inflected forms ("frustrated", "irritating") still match.
EMOTION_KEYWORDS = {
    "joy": ["happy", "joy", "excited", "wonderful", "great", "love", "beautiful", "amazing", "fantastic", "delighted", "pleased", "glad", "cheerful", "celebrate", "laugh", "smile"],
    "sadness": ["sad", "sorry", "unfortunately", "loss", "grief", "tears", "cry", "mourn", "depressed", "heartbroken", "tragic", "miserable", "lonely", "pain", "suffer"],
    "anger": ["angry", "furious", "outraged", "hate", "frustrat", "annoyed", "mad", "rage", "hostile", "bitter", "resent", "irritat", "violent", "fight", "attack"],
    "fear": ["afraid", "fear", "scared", "terrified", "worried", "anxious", "panic", "horror", "dread", "nervous", "frighten", "danger", "threat", "alarm"],
    "surprise": ["surprised", "amazed", "astonished", "shocked", "unexpected", "wow", "incredible", "unbelievable", "sudden", "remarkable", "stunning"],
}
52
+
53
  # Global model cache (lazy loaded)
54
  _models = {}
55
 
 
217
 
218
  return _models["tts"]
219
 
220
# ============================================
# EMOTION DETECTION
# ============================================
def detect_emotion(text: str) -> str:
    """Classify *text* into one of the EMOTION_KEYWORDS categories.

    Scores each emotion by counting keyword stems found in the text,
    adds punctuation/casing cues, and returns the highest-scoring
    emotion, or "neutral" when nothing matched.

    Keywords are matched at a word start (``\\b`` anchor) so a stem like
    "mad" no longer fires inside unrelated words ("nomad"), while
    deliberate stems such as "frustrat" still match "frustrated".
    """
    text_lower = text.lower()

    emotion_scores = {emotion: 0 for emotion in EMOTION_KEYWORDS}

    for emotion, keywords in EMOTION_KEYWORDS.items():
        for keyword in keywords:
            # Anchor at a word boundary; suffixes stay allowed so the
            # prefix stems in EMOTION_KEYWORDS keep working.
            if re.search(r"\b" + re.escape(keyword), text_lower):
                emotion_scores[emotion] += 1

    # Punctuation-based cues
    if text.count('!') >= 2:
        emotion_scores["joy"] += 1
        emotion_scores["surprise"] += 1
    if text.count('?') >= 2:
        emotion_scores["surprise"] += 1
    # Sustained ALL-CAPS reads as shouting
    if text.isupper() and len(text) > 10:
        emotion_scores["anger"] += 1

    # Highest-scoring emotion wins; ties resolve in dict order.
    max_emotion = max(emotion_scores, key=emotion_scores.get)

    if emotion_scores[max_emotion] > 0:
        return max_emotion
    return "neutral"
249
+
250
# ============================================
# AUDIO EMOTION PROCESSING
# ============================================
def apply_emotion_to_audio(audio: np.ndarray, emotion: str, sample_rate: int = SAMPLE_RATE) -> np.ndarray:
    """Apply crude emotion effects (playback rate + volume) to a mono waveform.

    NOTE: plain resampling changes pitch and speed *together*, so the
    "pitch" and "speed" settings cannot act independently here — they
    compound into a single rate factor.  (Independent pitch shifting
    would require a phase-vocoder style effect.)  The original code
    performed two back-to-back resamples; this does the equivalent
    length change in one pass, halving the interpolation error/work.

    Returns the (possibly shorter/longer) processed waveform, peak-
    limited to 0.95 to prevent clipping.
    """
    # Nothing to do for neutral or empty input (np.max would raise on empty).
    if emotion == "neutral" or audio.size == 0:
        return audio

    settings = EMOTION_SETTINGS.get(emotion, EMOTION_SETTINGS["neutral"])

    # Combined rate change: >1 shortens the clip (faster / higher),
    # <1 lengthens it (slower / lower).
    rate = settings["pitch"] * settings["speed"]
    if rate != 1.0:
        audio = signal.resample(audio, int(len(audio) / rate))

    # Energy/volume adjustment
    audio = audio * settings["energy"]

    # Normalize to prevent clipping
    max_val = np.max(np.abs(audio))
    if max_val > 0.95:
        audio = audio * (0.95 / max_val)

    return audio
284
+
285
def add_pause(duration_ms: int = 300) -> np.ndarray:
    """Return a silent mono segment lasting *duration_ms* milliseconds.

    Sample count is derived from the module-wide SAMPLE_RATE and
    truncated to a whole number of samples.
    """
    sample_count = int(SAMPLE_RATE * duration_ms / 1000)
    return np.zeros(sample_count)
289
+
290
  # ============================================
291
  # TRANSLATION
292
  # ============================================
 
385
  # ============================================
386
  MAX_CHARS = 10000 # Max characters to process (increase for longer files)
387
 
388
+ def process_document(file, enable_emotions=True, progress=gr.Progress()):
389
+ """Main pipeline: Document β†’ Translation β†’ TTS with Emotions β†’ Audiobook"""
390
 
391
  if file is None:
392
  return None, "", "", "⚠️ Please upload a document"
 
410
 
411
  # Split into sentences for batch processing
412
  sentences = re.split(r'(?<=[.!?])\s+', text)
413
+ sentences = [s.strip() for s in sentences if s.strip()]
414
  total_sentences = len(sentences)
415
 
416
+ # Detect emotions for each sentence
417
+ progress(0.08, desc="🎭 Analyzing emotions...")
418
+ sentence_emotions = []
419
+ for sentence in sentences:
420
+ emotion = detect_emotion(sentence) if enable_emotions else "neutral"
421
+ sentence_emotions.append(emotion)
422
+
423
+ # Count emotions
424
+ emotion_counts = {}
425
+ for e in sentence_emotions:
426
+ emotion_counts[e] = emotion_counts.get(e, 0) + 1
427
+
428
  # Translate in batches
429
  progress(0.1, desc=f"🌍 Translating {total_sentences} sentences...")
430
  translated_sentences = []
 
439
  continue
440
 
441
  # Update progress
442
+ prog = 0.1 + (0.35 * (i / total_sentences))
443
+ emotion_emoji = EMOTION_SETTINGS[sentence_emotions[i]]["emoji"]
444
+ progress(prog, desc=f"🌍 Translating {i+1}/{total_sentences} {emotion_emoji}")
445
 
446
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
447
  if device == "cuda":
 
458
 
459
  translated = " ".join(translated_sentences)
460
 
461
+ # Generate audio with emotions
462
+ progress(0.45, desc="πŸŽ™οΈ Generating expressive audio...")
 
 
463
 
464
  tts_model, tts_tokenizer = get_tts_model()
465
  audio_segments = []
466
  timestamps = []
467
  current_time = 0.0
468
 
469
+ # Split translated text for TTS
470
+ hausa_chunks = split_text(translated)
471
+ total_chunks = len(hausa_chunks)
472
+
473
+ # Map chunks to emotions (approximate)
474
+ chunk_emotions = []
475
+ chunk_idx = 0
476
+ for i, emotion in enumerate(sentence_emotions):
477
+ # Estimate how many chunks per sentence
478
+ if i < len(sentences):
479
+ sentence_len = len(translated_sentences[i]) if i < len(translated_sentences) else 100
480
+ chunks_per_sentence = max(1, sentence_len // MAX_CHUNK_LENGTH + 1)
481
+ for _ in range(chunks_per_sentence):
482
+ if chunk_idx < total_chunks:
483
+ chunk_emotions.append(emotion)
484
+ chunk_idx += 1
485
+
486
+ # Fill remaining with neutral
487
+ while len(chunk_emotions) < total_chunks:
488
+ chunk_emotions.append("neutral")
489
+
490
  with torch.no_grad():
491
+ for i, chunk in enumerate(hausa_chunks):
492
  if not chunk.strip():
493
  continue
494
 
495
+ # Get emotion for this chunk
496
+ emotion = chunk_emotions[i] if i < len(chunk_emotions) else "neutral"
497
+ emotion_emoji = EMOTION_SETTINGS[emotion]["emoji"]
498
+
499
  # Update progress
500
+ prog = 0.45 + (0.45 * (i / total_chunks))
501
+ progress(prog, desc=f"πŸŽ™οΈ Generating audio {i+1}/{total_chunks} {emotion_emoji}")
502
 
503
  inputs = tts_tokenizer(chunk, return_tensors="pt")
504
  if device == "cuda":
505
  inputs = {k: v.cuda() for k, v in inputs.items()}
506
 
507
  audio = tts_model(**inputs).waveform.squeeze().cpu().numpy()
508
+
509
+ # Apply emotion effects
510
+ if enable_emotions and emotion != "neutral":
511
+ audio = apply_emotion_to_audio(audio, emotion)
512
+
513
  audio_segments.append(audio)
514
 
515
+ # Add small pause between chunks
516
+ audio_segments.append(add_pause(200))
517
+
518
  duration = len(audio) / SAMPLE_RATE
519
  timestamps.append({
520
  "start": format_time(current_time),
521
  "end": format_time(current_time + duration),
522
+ "text": chunk,
523
+ "emotion": emotion,
524
+ "emoji": emotion_emoji
525
  })
526
+ current_time += duration + 0.2 # Include pause
527
 
528
  # Concatenate audio
529
  if not audio_segments:
 
531
 
532
  full_audio = np.concatenate(audio_segments)
533
 
534
+ # Normalize final audio
535
+ max_val = np.max(np.abs(full_audio))
536
+ if max_val > 0:
537
+ full_audio = full_audio * (0.9 / max_val)
538
+
539
  # Save audio
540
  progress(0.95, desc="πŸ’Ύ Saving audiobook...")
541
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
542
  wavfile.write(f.name, SAMPLE_RATE, (full_audio * 32767).astype(np.int16))
543
  audio_path = f.name
544
 
545
+ # Format timestamps with emotions
546
+ timestamps_text = "\n".join([
547
+ f"[{t['start']} β†’ {t['end']}] {t['emoji']} [{t['emotion'].upper()}] {t['text']}"
548
+ for t in timestamps
549
+ ])
550
 
551
  # Calculate audio duration
552
  audio_duration = len(full_audio) / SAMPLE_RATE
553
  duration_str = f"{int(audio_duration // 60)}:{int(audio_duration % 60):02d}"
554
 
555
+ # Emotion summary
556
+ emotion_summary = " | ".join([
557
+ f"{EMOTION_SETTINGS[e]['emoji']} {e}: {c}"
558
+ for e, c in sorted(emotion_counts.items(), key=lambda x: -x[1])
559
+ ])
560
+
561
  transcript = f"""## Original (English)
562
  {text[:1000]}{'...' if len(text) > 1000 else ''}{truncated_msg}
563
 
 
566
 
567
  ---
568
  πŸ“Š **Stats**: {len(text):,} chars β†’ {len(translated):,} chars | 🎡 Duration: {duration_str}
569
+
570
+ 🎭 **Emotions detected**: {emotion_summary}
571
  """
572
 
573
  progress(1.0, desc="βœ… Done!")
574
+ return audio_path, transcript, timestamps_text, f"βœ… Audiobook generated! Duration: {duration_str} | 🎭 Emotions: {len([e for e in sentence_emotions if e != 'neutral'])} expressive segments"
575
 
576
  except Exception as e:
577
  import traceback
 
590
  <div style="text-align: center; margin-bottom: 1rem;">
591
  <h1>🎧 PlotWeaver Audiobook Generator</h1>
592
  <p><strong>English β†’ Hausa</strong> | Powered by NLLB-200 + MMS-TTS</p>
593
+ <p style="color: #666;">✨ Now with Emotional Expression!</p>
594
  </div>
595
  """)
596
 
 
601
  file_types=[".pdf", ".docx", ".doc", ".txt"],
602
  type="filepath"
603
  )
604
+
605
+ emotion_toggle = gr.Checkbox(
606
+ label="🎭 Enable Emotional Expression",
607
+ value=True,
608
+ info="Adds emotion to voice based on text sentiment"
609
+ )
610
+
611
  btn = gr.Button("πŸš€ Generate Audiobook", variant="primary", size="lg")
612
  status = gr.Textbox(label="Status", interactive=False)
613
 
614
  gr.Markdown("""
615
  ### How it works
616
  1. Upload English document (PDF, DOCX, DOC, TXT)
617
+ 2. AI **detects emotions** in text
618
+ 3. Translates to Hausa with NLLB-200
619
+ 4. TTS generates **expressive audio**
620
+ 5. Download audiobook with timestamps
621
+
622
+ ---
623
+ ### 🎭 Emotions Detected
624
+ - 😊 **Joy** - Higher pitch, faster pace
625
+ - 😒 **Sadness** - Lower pitch, slower pace
626
+ - 😠 **Anger** - Intense, louder
627
+ - 😨 **Fear** - Faster, higher pitch
628
+ - 😲 **Surprise** - Excited tone
629
+ - 😐 **Neutral** - Normal speech
630
 
631
  ---
632
+ ⏱️ **Processing**: ~1-2 min per page
 
633
  """)
634
 
635
  with gr.Column(scale=2):
636
+ audio_out = gr.Audio(label="🎧 Hausa Audiobook (with Emotions)")
637
  with gr.Tabs():
638
  with gr.Tab("πŸ“œ Transcript"):
639
  transcript = gr.Markdown()
640
+ with gr.Tab("⏱️ Timestamps + Emotions"):
641
+ timestamps = gr.Textbox(lines=12, interactive=False)
642
 
643
  gr.HTML("""<div style="text-align: center; padding: 1rem; background: #f8f9fa; border-radius: 8px; margin-top: 1rem;">
644
+ <strong>PlotWeaver</strong> - AI for African Languages | 🎭 Expressive Audiobooks
645
  </div>""")
646
 
647
+ btn.click(
648
+ process_document,
649
+ [file_input, emotion_toggle],
650
+ [audio_out, transcript, timestamps, status]
651
+ )
652
 
653
  # ============================================
654
  # LAUNCH