jefffffff9 Claude Sonnet 4.6 committed on
Commit
96cdb10
·
1 Parent(s): 6f4d8d0

Fix: replace YouTube download with audio upload (HF Spaces blocks outbound HTTP)

Browse files

The HF cpu-basic tier has no outbound internet access, so yt-dlp can't reach
YouTube. The downloader is replaced with an upload widget: the user converts
the video to MP3 locally (e.g. ytmp3.cc / cobalt.tools) and uploads the file
here together with its transcription.

Also removed yt-dlp from requirements.txt.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +64 -97
  2. requirements.txt +0 -3
app.py CHANGED
@@ -499,88 +499,50 @@ def _load_phrase_additions_from_hub() -> None:
499
  threading.Thread(target=_load_phrase_additions_from_hub, daemon=True).start()
500
 
501
 
502
- def _download_youtube_for_training(lang_label: str, url: str, transcript: str) -> str:
503
- """Download YouTube audio and save as a training sample to HF Hub."""
504
- url = url.strip()
505
  transcript = transcript.strip()
506
- if not url:
507
- return "⚠️ Please enter a YouTube URL."
508
  if not transcript:
509
- return "⚠️ Please type the transcription (what is said in the video)."
510
-
511
- try:
512
- import yt_dlp # noqa: F401
513
- except ImportError:
514
- return "⚠️ yt-dlp is not installed. Add 'yt-dlp' to requirements.txt and redeploy."
515
 
516
  lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
 
 
 
517
 
518
- try:
519
- import tempfile, os
520
- with tempfile.TemporaryDirectory() as tmp:
521
- out_template = os.path.join(tmp, "audio.%(ext)s")
522
- ydl_opts = {
523
- "format": "bestaudio/best",
524
- "outtmpl": out_template,
525
- "quiet": True,
526
- "no_warnings": True,
527
- "max_filesize": 50 * 1024 * 1024, # 50 MB cap
528
- "postprocessors": [{
529
- "key": "FFmpegExtractAudio",
530
- "preferredcodec": "wav",
531
- "preferredquality": "16",
532
- }],
533
- }
534
- import yt_dlp
535
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
536
- info = ydl.extract_info(url, download=True)
537
- title = info.get("title", "unknown")
538
- duration = info.get("duration", 0)
539
-
540
- if duration and duration > 600:
541
- return "⚠️ Video is longer than 10 minutes. Use a shorter clip or timestamp range."
542
-
543
- # Find the downloaded file
544
- wav_file = os.path.join(tmp, "audio.wav")
545
- if not os.path.exists(wav_file):
546
- # Try any audio file
547
- for fname in os.listdir(tmp):
548
- if fname.startswith("audio."):
549
- wav_file = os.path.join(tmp, fname)
550
- break
551
-
552
- if not os.path.exists(wav_file):
553
- return "❌ Audio download failed. The video may be unavailable or geo-restricted."
554
-
555
- # Upload to HF Hub feedback dataset
556
- timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
557
- audio_repo_path = f"youtube_audio/{lang}/{timestamp}.wav"
558
- meta_repo_path = f"youtube_audio/{lang}/{timestamp}.txt"
559
-
560
- if _hf_api is not None and FEEDBACK_REPO_ID:
561
- import io
562
- _hf_api.upload_file(
563
- path_or_fileobj=wav_file,
564
- path_in_repo=audio_repo_path,
565
- repo_id=FEEDBACK_REPO_ID,
566
- repo_type="dataset",
567
- )
568
- meta = f"title: {title}\nurl: {url}\nlanguage: {lang}\ntranscription: {transcript}\n"
569
- _hf_api.upload_file(
570
- path_or_fileobj=io.BytesIO(meta.encode()),
571
- path_in_repo=meta_repo_path,
572
- repo_id=FEEDBACK_REPO_ID,
573
- repo_type="dataset",
574
- )
575
- return (f"✅ Saved to training dataset!\n"
576
- f"Title: {title} ({duration}s)\n"
577
- f"Audio: {audio_repo_path}\n"
578
- f"The transcription is saved alongside it.")
579
- else:
580
- return "⚠️ HF_TOKEN not set — audio downloaded but could not be saved to Hub."
581
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
  except Exception as exc:
583
- return f"❌ Download failed: {exc}"
584
 
585
 
586
  # ── Main ask handler ──────────────────────────────────────────────────────────
@@ -797,37 +759,42 @@ def build_ui() -> gr.Blocks:
797
  kb_import_btn = gr.Button("➕ Add to Knowledge Base", variant="primary")
798
  kb_status = gr.Textbox(label="Status", interactive=False, lines=3)
799
 
800
- # ── Right: YouTube import ─────────────────────────────────
801
  with gr.Column():
802
  gr.Markdown(
803
- "### 🎬 Import audio from YouTube\n"
804
- "Paste a link to a Bambara or Fula YouTube video "
805
- "(lesson, conversation, song, news, etc.). "
806
- "The audio is downloaded and saved to the training dataset.\n\n"
807
- "**Tips for finding good videos:**\n"
808
- "- Search YouTube for **'Bambara conversation'**, **'Bamanankan'**, "
809
- "**'Pular leçon'**, **'Fulfulde'**\n"
810
- "- Language learning channels and radio recordings work best\n"
811
- "- Videos under 10 minutes are preferred\n\n"
812
- "After saving, run the training notebook on Kaggle/Colab to fine-tune "
813
- "the speech model with your new audio."
814
  )
815
  yt_lang = gr.Dropdown(
816
  choices=["Bambara (bam)", "Fula (ful)"],
817
  value="Bambara (bam)",
818
- label="Language spoken in the video",
819
  )
820
- yt_url = gr.Textbox(
821
- placeholder="https://www.youtube.com/watch?v=...",
822
- label="YouTube URL",
 
823
  )
824
  yt_transcript = gr.Textbox(
825
  lines=5,
826
- placeholder="Type what is said in the video (as much as you can). "
827
- "This transcription will be used to train the speech model.",
828
- label="Transcription — what is said in this video",
 
 
 
 
829
  )
830
- yt_btn = gr.Button("⬇️ Download & Save for Training", variant="secondary")
831
  yt_status = gr.Textbox(label="Status", interactive=False, lines=4)
832
 
833
  kb_import_btn.click(
@@ -836,8 +803,8 @@ def build_ui() -> gr.Blocks:
836
  outputs=[kb_status],
837
  )
838
  yt_btn.click(
839
- fn=_download_youtube_for_training,
840
- inputs=[yt_lang, yt_url, yt_transcript],
841
  outputs=[yt_status],
842
  )
843
 
 
499
  threading.Thread(target=_load_phrase_additions_from_hub, daemon=True).start()
500
 
501
 
502
+ def _save_audio_for_training(lang_label: str, audio_path: str | None, transcript: str, source_note: str) -> str:
503
+ """Save an uploaded audio file + transcription as a training sample to HF Hub."""
 
504
  transcript = transcript.strip()
505
+ if audio_path is None:
506
+ return "⚠️ Please upload an audio file first."
507
  if not transcript:
508
+ return "⚠️ Please type the transcription what is said in this audio."
 
 
 
 
 
509
 
510
  lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
511
+ timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
512
+ audio_repo_path = f"training_audio/{lang}/{timestamp}.wav"
513
+ meta_repo_path = f"training_audio/{lang}/{timestamp}.txt"
514
 
515
+ if _hf_api is None or not FEEDBACK_REPO_ID:
516
+ return "⚠️ HF_TOKEN not set — file saved locally only, not uploaded to Hub."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
 
518
+ try:
519
+ import io
520
+ _hf_api.upload_file(
521
+ path_or_fileobj=audio_path,
522
+ path_in_repo=audio_repo_path,
523
+ repo_id=FEEDBACK_REPO_ID,
524
+ repo_type="dataset",
525
+ )
526
+ meta = (
527
+ f"language: {lang}\n"
528
+ f"transcription: {transcript}\n"
529
+ f"source: {source_note.strip() or 'uploaded'}\n"
530
+ f"timestamp: {timestamp}\n"
531
+ )
532
+ _hf_api.upload_file(
533
+ path_or_fileobj=io.BytesIO(meta.encode()),
534
+ path_in_repo=meta_repo_path,
535
+ repo_id=FEEDBACK_REPO_ID,
536
+ repo_type="dataset",
537
+ )
538
+ return (
539
+ f"✅ Saved to training dataset!\n"
540
+ f"Audio: {audio_repo_path}\n"
541
+ f"Transcription: {transcript[:80]}{'…' if len(transcript) > 80 else ''}\n"
542
+ f"Run the training notebook on Kaggle to include this in the next model update."
543
+ )
544
  except Exception as exc:
545
+ return f"❌ Upload failed: {exc}"
546
 
547
 
548
  # ── Main ask handler ──────────────────────────────────────────────────────────
 
759
  kb_import_btn = gr.Button("➕ Add to Knowledge Base", variant="primary")
760
  kb_status = gr.Textbox(label="Status", interactive=False, lines=3)
761
 
762
+ # ── Right: audio upload for training ─────────────────────
763
  with gr.Column():
764
  gr.Markdown(
765
+ "### 🎬 Add audio from YouTube (or anywhere)\n"
766
+ "HuggingFace Spaces cannot download YouTube directly, "
767
+ "so convert the video to audio first on your computer:\n\n"
768
+ "**Free online converters:**\n"
769
+ "- [ytmp3.cc](https://ytmp3.cc) paste YouTube URL → download MP3\n"
770
+ "- [cobalt.tools](https://cobalt.tools) paste any video URL → download audio\n"
771
+ "- [y2mate.com](https://y2mate.com) — paste YouTube URL → download MP3\n\n"
772
+ "**Good YouTube search terms:**\n"
773
+ "- Bambara: *'Bamanankan conversation'*, *'Bambara leçon'*, *'donsomana'*\n"
774
+ "- Fula: *'Fulfulde leçon'*, *'Pular conversation'*, *'Fula radio'*\n\n"
775
+ "Then upload the MP3/WAV file below with its transcription."
776
  )
777
  yt_lang = gr.Dropdown(
778
  choices=["Bambara (bam)", "Fula (ful)"],
779
  value="Bambara (bam)",
780
+ label="Language spoken in the audio",
781
  )
782
+ yt_audio = gr.Audio(
783
+ sources=["upload"],
784
+ type="filepath",
785
+ label="Upload audio file (MP3 or WAV)",
786
  )
787
  yt_transcript = gr.Textbox(
788
  lines=5,
789
+ placeholder="Type what is said in the audio (as much as you can).\n"
790
+ "Example:\nJam waali. No mbadda. Mi woni ɗoo wallude ma.",
791
+ label="Transcription — what is said in this audio",
792
+ )
793
+ yt_source = gr.Textbox(
794
+ placeholder="e.g. YouTube: Bambara lesson by Moussa Kouyaté",
795
+ label="Source (optional — for your records)",
796
  )
797
+ yt_btn = gr.Button("💾 Save Audio for Training", variant="secondary")
798
  yt_status = gr.Textbox(label="Status", interactive=False, lines=4)
799
 
800
  kb_import_btn.click(
 
803
  outputs=[kb_status],
804
  )
805
  yt_btn.click(
806
+ fn=_save_audio_for_training,
807
+ inputs=[yt_lang, yt_audio, yt_transcript, yt_source],
808
  outputs=[yt_status],
809
  )
810
 
requirements.txt CHANGED
@@ -51,6 +51,3 @@ scipy==1.15.2
51
 
52
  # Phrase matching (fuzzy match for Whisper mis-transcriptions of Bambara/Fula)
53
  rapidfuzz==3.13.0
54
-
55
- # YouTube audio download for training data collection
56
- yt-dlp==2025.3.31
 
51
 
52
  # Phrase matching (fuzzy match for Whisper mis-transcriptions of Bambara/Fula)
53
  rapidfuzz==3.13.0