jefffffff9 Claude Sonnet 4.6 committed on
Commit
eddcaea
·
1 Parent(s): 98eece8

Add mass training + internet self-teaching features

Browse files

New Tab 5 — Bulk Upload:
- Upload ZIP of audio files + CSV (filename,transcription) in one operation
- Batch-inserts all samples into corrections.jsonl + uploads audio to Hub

New Tab 6 — Self-Teaching:
- Wikipedia harvest: bm/ff Wikipedia API → sentence text → vocabulary.jsonl
(868 Bambara articles, 17k+ Fula articles)
- HF dataset import: RobotsMali/jeli-asr (33k Bambara) + google/fleurs ff_sn
(Fula) → corrections.jsonl, one click
- Auto-training trigger: fires Kaggle kernel via REST API when
corrections.jsonl reaches AUTO_TRAIN_THRESHOLD (default 50) entries
Requires KAGGLE_USERNAME + KAGGLE_KEY Space secrets

New src/data/web_harvester.py:
- harvest_wikipedia_text(lang, max_articles) — MediaWiki API, no deps
- harvest_hf_audio(lang, token) — generator yielding (wav_bytes, text, path)
- _numpy_to_wav_bytes() — stdlib-only WAV encoder

Auto-trigger also fires after each Tab 2 correction save.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +407 -6
  2. src/data/web_harvester.py +171 -0
app.py CHANGED
@@ -3,11 +3,14 @@ Sahel-Agri Voice AI — HuggingFace Spaces (ZeroGPU)
3
  Two-way voice assistant: Bambara / Fula / French / English → voice response
4
 
5
  Environment variables (set in Space Settings → Secrets):
6
- HF_TOKEN — HF write-access token
7
- FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
8
- ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
9
- WHISPER_MODEL_ID — default: openai/whisper-large-v3-turbo
10
- (use openai/whisper-base for local CPU testing)
 
 
 
11
  """
12
 
13
  from __future__ import annotations
@@ -33,7 +36,11 @@ FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedba
33
  ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
34
  # whisper-small: ~10s on cpu-basic, good multilingual quality.
35
  # Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
36
- WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
 
 
 
 
37
 
38
  # On local CPU (no HF_TOKEN / no spaces package) fall back gracefully
39
  _ON_SPACES = os.environ.get("SPACE_ID") is not None
@@ -324,6 +331,7 @@ def _save_feedback_to_hub(
324
  return f"⚠️ Audio uploaded but corrections.jsonl update failed: {e}"
325
 
326
  total = updated.count("\n")
 
327
  return f"✅ Saved to Hub (#{total}) — {FEEDBACK_REPO_ID}"
328
 
329
  except Exception as e:
@@ -582,6 +590,289 @@ def _save_audio_for_training(lang_label: str, audio_path: str | None, transcript
582
  return f"❌ Upload failed: {exc}"
583
 
584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  # ── Main ask handler ──────────────────────────────────────────────────────────
586
 
587
  def handle_ask(audio_path, language_label):
@@ -872,6 +1163,116 @@ def build_ui() -> gr.Blocks:
872
  reload_btn.click(fn=_reload_adapters_from_hub, outputs=[reload_out])
873
  reload_btn.click(fn=_get_adapter_status, outputs=[adapter_status_md])
874
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
  return demo
876
 
877
 
 
3
  Two-way voice assistant: Bambara / Fula / French / English → voice response
4
 
5
  Environment variables (set in Space Settings → Secrets):
6
+ HF_TOKEN — HF write-access token
7
+ FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
8
+ ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
9
+ WHISPER_MODEL_ID — default: openai/whisper-small
10
+ KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
11
+ KAGGLE_KEY — Kaggle API key (for auto-trigger training)
12
+ KAGGLE_KERNEL_SLUG — default: ous-sow/sahel-voice-master-trainer
13
+ AUTO_TRAIN_THRESHOLD — corrections count that triggers auto-training (default: 50)
14
  """
15
 
16
  from __future__ import annotations
 
36
  ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
37
  # whisper-small: ~10s on cpu-basic, good multilingual quality.
38
  # Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
39
+ WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID", "openai/whisper-small")
40
+ KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME", "")
41
+ KAGGLE_KEY = os.environ.get("KAGGLE_KEY", "")
42
+ KAGGLE_KERNEL_SLUG = os.environ.get("KAGGLE_KERNEL_SLUG", "ous-sow/sahel-voice-master-trainer")
43
+ AUTO_TRAIN_THRESHOLD = int(os.environ.get("AUTO_TRAIN_THRESHOLD", "50"))
44
 
45
  # On local CPU (no HF_TOKEN / no spaces package) fall back gracefully
46
  _ON_SPACES = os.environ.get("SPACE_ID") is not None
 
331
  return f"⚠️ Audio uploaded but corrections.jsonl update failed: {e}"
332
 
333
  total = updated.count("\n")
334
+ _maybe_auto_trigger()
335
  return f"✅ Saved to Hub (#{total}) — {FEEDBACK_REPO_ID}"
336
 
337
  except Exception as e:
 
590
  return f"❌ Upload failed: {exc}"
591
 
592
 
593
+ # ── Auto-training trigger ─────────────────────────────────────────────────────
594
+
595
def _count_corrections() -> int:
    """Count the non-blank lines of ``corrections.jsonl`` in the feedback repo.

    Returns 0 when no Hub client is configured, and also when the file
    cannot be downloaded or read for any reason.
    """
    if _hf_api is None:
        return 0
    try:
        from huggingface_hub import hf_hub_download

        cached_path = hf_hub_download(
            repo_id=FEEDBACK_REPO_ID,
            filename="corrections.jsonl",
            repo_type="dataset",
            token=HF_TOKEN,
        )
        entries = 0
        with open(cached_path, encoding="utf-8") as handle:
            for raw_line in handle:
                if raw_line.strip():
                    entries += 1
        return entries
    except Exception:
        # Best-effort: every failure is reported as "no corrections".
        return 0
609
+
610
+
611
def _trigger_kaggle_training(lang: str = "bam") -> str:
    """POST a run request for ``KAGGLE_KERNEL_SLUG`` to the Kaggle REST API.

    ``lang`` is accepted from callers but is not included in the request.
    Returns a human-readable status string; any exception raised while
    building or sending the request is converted into an error string.
    """
    if not (KAGGLE_USERNAME and KAGGLE_KEY):
        return "⚠️ KAGGLE_USERNAME / KAGGLE_KEY not set in Space secrets — auto-trigger disabled."
    try:
        import base64
        import urllib.request

        # HTTP Basic auth: base64("username:key").
        credentials = f"{KAGGLE_USERNAME}:{KAGGLE_KEY}".encode()
        basic_auth = base64.b64encode(credentials).decode()
        request = urllib.request.Request(
            f"https://www.kaggle.com/api/v1/kernels/{KAGGLE_KERNEL_SLUG}/run",
            data=json.dumps({"enableGpu": True}).encode(),
            method="POST",
            headers={
                "Authorization": f"Basic {basic_auth}",
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(request, timeout=15) as response:
            payload = json.loads(response.read())
        return f"✅ Kaggle training triggered! Run ID: {payload.get('currentRunningVersion', 'started')}"
    except Exception as e:
        return f"❌ Kaggle trigger failed: {e}"
632
+
633
+
634
def _maybe_auto_trigger() -> None:
    """Start a background Kaggle training run when the threshold is reached.

    Called after corrections have been saved. Training is triggered when the
    current number of corrections is an exact multiple of
    ``AUTO_TRAIN_THRESHOLD``, or when it has moved past a multiple since the
    previous call in this process. The second case matters for batch imports,
    which add many entries at once and can jump over a multiple.

    Does nothing when Kaggle credentials are missing, when the threshold is
    not a positive number, or when the corrections count is 0 (which is also
    what ``_count_corrections`` reports on failure).
    """
    if not KAGGLE_USERNAME or not KAGGLE_KEY:
        return
    if AUTO_TRAIN_THRESHOLD <= 0:
        # Avoids ZeroDivisionError and treats a non-positive value as "disabled".
        return

    count = _count_corrections()
    if count <= 0:
        return

    # Number of whole thresholds covered so far; remembered between calls on
    # the function object so a jump over a multiple can be detected.
    bucket = count // AUTO_TRAIN_THRESHOLD
    previous_bucket = getattr(_maybe_auto_trigger, "_last_bucket", None)
    _maybe_auto_trigger._last_bucket = bucket

    on_multiple = count % AUTO_TRAIN_THRESHOLD == 0
    crossed_multiple = previous_bucket is not None and bucket > previous_bucket
    if on_multiple or crossed_multiple:
        # Fire-and-forget: the HTTP call must not block the UI handler.
        threading.Thread(target=_trigger_kaggle_training, daemon=True).start()
641
+
642
+
643
+ # ── Bulk upload handler ────────────────────────────────────────────────────────
644
+
645
def _bulk_upload(lang_label: str, zip_file, csv_text: str) -> str:
    """Batch-insert samples from a ZIP of audio files and a CSV of transcriptions.

    Args:
        lang_label: UI language label, mapped through ``SUPPORTED_LANGUAGES``
            (falls back to ``"bam"``).
        zip_file: ZIP archive as accepted by ``zipfile.ZipFile``, or ``None``
            for a CSV-only upload of audio-less entries.
        csv_text: CSV text with ``filename,transcription`` rows.

    Returns:
        A status message for display in the UI. Errors are reported in the
        message rather than raised.

    Audio is stored under ``audio/`` in the feedback repo and one record per
    sample is appended to ``corrections.jsonl``.
    """
    import csv
    import zipfile

    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError

    if _hf_api is None:
        return "⚠️ HF_TOKEN not set — cannot upload."
    csv_text = (csv_text or "").strip()  # tolerate None from an empty textbox
    if zip_file is None and not csv_text:
        return "⚠️ Upload a ZIP and/or paste a CSV."

    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")

    # Parse CSV: filename -> transcription (rows with fewer than 2 columns are ignored).
    transcript_map: dict[str, str] = {}
    for row in csv.reader(csv_text.splitlines()):
        if len(row) >= 2:
            transcript_map[row[0].strip()] = row[1].strip()

    # Collect (audio_bytes_or_None, filename, transcription) tuples.
    audio_suffixes = (".wav", ".mp3", ".ogg", ".flac", ".m4a")
    rows = []
    if zip_file is not None:
        try:
            with zipfile.ZipFile(zip_file, "r") as zf:
                for name in zf.namelist():
                    if not name.lower().endswith(audio_suffixes):
                        continue
                    # Match on the full archive path first, then on the bare filename.
                    text = transcript_map.get(name) or transcript_map.get(Path(name).name) or ""
                    if not text:
                        continue
                    rows.append((zf.read(name), Path(name).name, text))
        except Exception as e:
            return f"❌ ZIP read error: {e}"
    else:
        # CSV only — audio-less vocab entries.
        rows = [(None, fname, text) for fname, text in transcript_map.items()]

    if not rows:
        return "⚠️ No matching (audio, transcription) pairs found. Check filenames match CSV."

    # Upload audio and build one record per successfully handled sample.
    records = []
    errors = 0
    for audio_bytes, fname, text in rows:
        ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
        # NOTE(review): the stored name always ends in .wav, even when the
        # source file was mp3/ogg/flac/m4a — confirm the training side copes.
        audio_path = f"audio/{lang}_{ts}.wav"
        try:
            if audio_bytes:
                _hf_api.upload_file(
                    path_or_fileobj=io.BytesIO(audio_bytes),
                    path_in_repo=audio_path,
                    repo_id=FEEDBACK_REPO_ID,
                    repo_type="dataset",
                )
        except Exception:
            errors += 1
            continue
        records.append({
            "id": ts, "timestamp": datetime.now(timezone.utc).isoformat(),
            "language": lang,
            "audio_file": audio_path if audio_bytes else "",
            "transcription": text, "corrected_text": text,
            "source": f"bulk_upload:{fname}", "is_correction": False,
            "model": WHISPER_MODEL_ID,
        })

    if not records:
        return f"❌ All {errors} audio uploads failed — corrections.jsonl left unchanged."

    # Append all records to corrections.jsonl (read-modify-write, two attempts).
    new_lines = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
    updated = ""
    for attempt in range(2):
        try:
            try:
                local = hf_hub_download(
                    repo_id=FEEDBACK_REPO_ID, filename="corrections.jsonl",
                    repo_type="dataset", token=HF_TOKEN,
                )
                with open(local, encoding="utf-8") as f:
                    existing = f.read()
            except LocalEntryNotFoundError:
                # Hub unreachable and nothing cached: this does NOT prove the
                # file is absent, so it must not be treated as empty.
                raise
            except EntryNotFoundError:
                existing = ""  # file genuinely does not exist yet
            if existing and not existing.endswith("\n"):
                existing += "\n"  # keep one JSON object per line
            updated = existing + new_lines
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
                path_in_repo="corrections.jsonl",
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
            break
        except Exception as e:
            # Any other download error aborts the attempt instead of
            # overwriting the existing file with only the new records.
            if attempt == 1:
                return f"⚠️ Audio uploaded but corrections.jsonl failed: {e}"

    total = updated.count("\n")
    _maybe_auto_trigger()
    return (
        f"✅ Bulk upload complete!\n"
        f" Uploaded : {len(records)} samples ({errors} errors)\n"
        f" Dataset : {total} total corrections\n"
        f" Auto-train threshold: {AUTO_TRAIN_THRESHOLD} entries"
    )
747
+
748
+
749
+ # ── Internet self-teaching handlers ───────────────────────────────────────────
750
+
751
def _harvest_wikipedia(lang_label: str, max_articles: int = 100) -> str:
    """Harvest Wikipedia sentences for a language and append them to ``vocabulary.jsonl``.

    Args:
        lang_label: UI language label, mapped through ``SUPPORTED_LANGUAGES``.
            Only Bambara (``bam``) and Fula (``ful``) are accepted.
        max_articles: Upper bound on articles to fetch. Coerced to ``int``
            because UI sliders may deliver a float.

    Returns:
        A status message for display in the UI. Errors are reported in the
        message rather than raised.
    """
    if _hf_api is None:
        return "⚠️ HF_TOKEN not set."
    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
    if lang not in ("bam", "ful"):
        return "⚠️ Wikipedia harvest only supported for Bambara and Fula."

    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError
    from src.data.web_harvester import harvest_wikipedia_text

    try:
        entries = harvest_wikipedia_text(lang, max_articles=int(max_articles))
    except Exception as e:
        return f"❌ Wikipedia harvest failed: {e}"
    if not entries:
        return "⚠️ No text harvested — check network or try again."

    # Append to vocabulary.jsonl (read-modify-write, two attempts).
    new_lines = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in entries)
    updated = ""
    for attempt in range(2):
        try:
            try:
                local = hf_hub_download(
                    repo_id=FEEDBACK_REPO_ID, filename="vocabulary.jsonl",
                    repo_type="dataset", token=HF_TOKEN,
                )
                with open(local, encoding="utf-8") as f:
                    existing = f.read()
            except LocalEntryNotFoundError:
                # Hub unreachable and nothing cached: this does NOT prove the
                # file is absent, so it must not be treated as empty.
                raise
            except EntryNotFoundError:
                existing = ""  # file genuinely does not exist yet
            if existing and not existing.endswith("\n"):
                existing += "\n"  # keep one JSON object per line
            updated = existing + new_lines
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
                path_in_repo="vocabulary.jsonl",
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
            break
        except Exception as e:
            # Any other download error aborts the attempt instead of
            # overwriting the existing vocabulary with only the new entries.
            if attempt == 1:
                return f"❌ Upload failed: {e}"

    return (
        f"✅ Wikipedia harvest complete!\n"
        f" Language : {lang_label}\n"
        f" Sentences added : {len(entries)}\n"
        f" Total vocabulary entries: {updated.count(chr(10))}"
    )
796
+
797
+
798
def _harvest_hf_dataset(lang_label: str, max_samples: int = 500) -> str:
    """Import audio + transcriptions from public HF datasets into ``corrections.jsonl``.

    Args:
        lang_label: UI language label, mapped through ``SUPPORTED_LANGUAGES``.
            Only Bambara (``bam``) and Fula (``ful``) are accepted.
        max_samples: Stop after this many samples have been imported. Coerced
            to ``int`` because UI sliders may deliver a float.

    Returns:
        A status message for display in the UI. Errors are reported in the
        message rather than raised.

    Each sample's audio is uploaded to the feedback repo first; its record is
    only kept if that upload succeeds. Harvesting stops early after more than
    20 failed uploads.
    """
    if _hf_api is None:
        return "⚠️ HF_TOKEN not set."
    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
    if lang not in ("bam", "ful"):
        return "⚠️ HF dataset harvest only supported for Bambara and Fula."

    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError
    from src.data.web_harvester import harvest_hf_audio, HF_ASR_SOURCES

    sources = HF_ASR_SOURCES.get(lang, [])
    if not sources:
        return f"⚠️ No HF dataset configured for {lang}."

    max_samples = int(max_samples)
    records = []
    errors = 0
    for wav_bytes, text, repo_path in harvest_hf_audio(lang, HF_TOKEN):
        try:
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO(wav_bytes),
                path_in_repo=repo_path,
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
        except Exception:
            errors += 1
            if errors > 20:
                break
            continue

        # repo_path looks like "audio/<lang>_<YYYYMMDD>_<HHMMSS>_<micros>.wav".
        # Use the whole timestamp as the id; the last "_" field alone is only
        # the microsecond part.
        stem = repo_path.rsplit("/", 1)[-1]
        if stem.endswith(".wav"):
            stem = stem[:-4]
        sample_id = stem.partition("_")[2] or stem

        records.append({
            "id": sample_id, "timestamp": datetime.now(timezone.utc).isoformat(),
            "language": lang,
            "audio_file": repo_path,
            "transcription": text, "corrected_text": text,
            "source": f"hf_harvest:{sources[0]['repo']}",
            "is_correction": False, "model": WHISPER_MODEL_ID,
        })
        if len(records) >= max_samples:
            break

    if not records:
        return "⚠️ No samples harvested. Dataset may require accepting terms on HuggingFace first."

    # Append to corrections.jsonl (read-modify-write, two attempts).
    new_lines = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
    updated = ""
    for attempt in range(2):
        try:
            try:
                local = hf_hub_download(
                    repo_id=FEEDBACK_REPO_ID, filename="corrections.jsonl",
                    repo_type="dataset", token=HF_TOKEN,
                )
                with open(local, encoding="utf-8") as f:
                    existing = f.read()
            except LocalEntryNotFoundError:
                # Hub unreachable and nothing cached: this does NOT prove the
                # file is absent, so it must not be treated as empty.
                raise
            except EntryNotFoundError:
                existing = ""  # file genuinely does not exist yet
            if existing and not existing.endswith("\n"):
                existing += "\n"  # keep one JSON object per line
            updated = existing + new_lines
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
                path_in_repo="corrections.jsonl",
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
            break
        except Exception as e:
            # Any other download error aborts the attempt instead of
            # overwriting the existing file with only the new records.
            if attempt == 1:
                return f"❌ corrections.jsonl update failed: {e}"

    total = updated.count("\n")
    _maybe_auto_trigger()
    return (
        f"✅ HF dataset harvest complete!\n"
        f" Source : {sources[0]['repo']}\n"
        f" Imported : {len(records)} samples ({errors} errors)\n"
        f" Dataset : {total} total corrections\n"
    )
874
+
875
+
876
  # ── Main ask handler ──────────────────────────────────────────────────────────
877
 
878
  def handle_ask(audio_path, language_label):
 
1163
  reload_btn.click(fn=_reload_adapters_from_hub, outputs=[reload_out])
1164
  reload_btn.click(fn=_get_adapter_status, outputs=[adapter_status_md])
1165
 
1166
+ # ── Tab 5: Bulk Upload ────────────────────────────────────────────
1167
+ with gr.TabItem("📦 Bulk Upload"):
1168
+ gr.Markdown(
1169
+ "## Upload many audio samples at once\n\n"
1170
+ "**Step 1** — Prepare a ZIP file containing your audio files (WAV/MP3).\n\n"
1171
+ "**Step 2** — Prepare a CSV with two columns: `filename,transcription`\n"
1172
+ "```\nbam_001.wav,I ni ce a tɔ\nbam_002.wav,Sanji bɛ na sini\n```\n\n"
1173
+ "**Step 3** — Select language, upload ZIP, paste CSV, click Upload."
1174
+ )
1175
+ with gr.Row():
1176
+ with gr.Column():
1177
+ bulk_lang = gr.Dropdown(
1178
+ choices=["Bambara (bam)", "Fula (ful)"],
1179
+ value="Bambara (bam)", label="Language"
1180
+ )
1181
+ bulk_zip = gr.File(
1182
+ label="ZIP file (audio files)", file_types=[".zip"]
1183
+ )
1184
+ bulk_csv = gr.Textbox(
1185
+ lines=10,
1186
+ label="CSV — filename,transcription (one per line)",
1187
+ placeholder="bam_001.wav,I ni ce a tɔ\nbam_002.wav,Sanji bɛ na sini",
1188
+ )
1189
+ bulk_btn = gr.Button("📤 Upload Batch", variant="primary")
1190
+ bulk_status = gr.Textbox(label="Status", interactive=False, lines=5)
1191
+ bulk_btn.click(
1192
+ fn=_bulk_upload,
1193
+ inputs=[bulk_lang, bulk_zip, bulk_csv],
1194
+ outputs=[bulk_status],
1195
+ )
1196
+
1197
+ # ── Tab 6: Self-Teaching ──────────────────────────────────────────
1198
+ with gr.TabItem("🌐 Self-Teaching"):
1199
+ gr.Markdown(
1200
+ "## Teach the model from the internet\n\n"
1201
+ "These tools pull publicly available Bambara and Fula language data "
1202
+ "directly into your training dataset — no manual work required."
1203
+ )
1204
+ with gr.Row():
1205
+ # Wikipedia harvest
1206
+ with gr.Column():
1207
+ gr.Markdown(
1208
+ "### 📖 Wikipedia Text Harvest\n"
1209
+ "Pulls sentence-length text from Bambara Wikipedia (868 articles) "
1210
+ "or Fula Wikipedia (17,000+ articles) into `vocabulary.jsonl`.\n\n"
1211
+ "Use this to expand vocabulary coverage before a training run."
1212
+ )
1213
+ wiki_lang = gr.Dropdown(
1214
+ choices=["Bambara (bam)", "Fula (ful)"],
1215
+ value="Bambara (bam)", label="Language"
1216
+ )
1217
+ wiki_articles = gr.Slider(
1218
+ minimum=10, maximum=500, value=100, step=10,
1219
+ label="Max articles to fetch"
1220
+ )
1221
+ wiki_btn = gr.Button("📖 Harvest Wikipedia Text", variant="secondary")
1222
+ wiki_status = gr.Textbox(label="Status", interactive=False, lines=4)
1223
+ wiki_btn.click(
1224
+ fn=_harvest_wikipedia,
1225
+ inputs=[wiki_lang, wiki_articles],
1226
+ outputs=[wiki_status],
1227
+ )
1228
+
1229
+ # HF dataset harvest
1230
+ with gr.Column():
1231
+ gr.Markdown(
1232
+ "### 🤗 HuggingFace Dataset Import\n"
1233
+ "Pulls real audio + transcriptions from:\n"
1234
+ "- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
1235
+ "- **Fula**: `google/fleurs ff_sn`\n\n"
1236
+ "Samples are added to `corrections.jsonl` and counted toward "
1237
+ f"the auto-training threshold ({AUTO_TRAIN_THRESHOLD} entries)."
1238
+ )
1239
+ hf_lang = gr.Dropdown(
1240
+ choices=["Bambara (bam)", "Fula (ful)"],
1241
+ value="Bambara (bam)", label="Language"
1242
+ )
1243
+ hf_samples = gr.Slider(
1244
+ minimum=50, maximum=2000, value=500, step=50,
1245
+ label="Max samples to import"
1246
+ )
1247
+ hf_btn = gr.Button("🤗 Import from HuggingFace", variant="primary")
1248
+ hf_status = gr.Textbox(label="Status", interactive=False, lines=5)
1249
+ hf_btn.click(
1250
+ fn=_harvest_hf_dataset,
1251
+ inputs=[hf_lang, hf_samples],
1252
+ outputs=[hf_status],
1253
+ )
1254
+
1255
+ gr.Markdown("---")
1256
+ gr.Markdown(
1257
+ "### ⚡ Auto-Training\n"
1258
+ f"When `corrections.jsonl` reaches a multiple of **{AUTO_TRAIN_THRESHOLD}** entries, "
1259
+ "the Kaggle training notebook is triggered automatically.\n\n"
1260
+ "To enable: add `KAGGLE_USERNAME` and `KAGGLE_KEY` in Space Settings → Secrets.\n\n"
1261
+ f"Kernel: `{KAGGLE_KERNEL_SLUG}`"
1262
+ )
1263
+ with gr.Row():
1264
+ trigger_lang = gr.Dropdown(
1265
+ choices=["Bambara (bam)", "Fula (ful)"],
1266
+ value="Bambara (bam)", label="Language to train"
1267
+ )
1268
+ trigger_btn = gr.Button("⚡ Trigger Training Now", variant="secondary")
1269
+ trigger_out = gr.Textbox(label="Status", interactive=False, lines=2)
1270
+ trigger_btn.click(
1271
+ fn=lambda l: _trigger_kaggle_training(SUPPORTED_LANGUAGES.get(l, "bam")),
1272
+ inputs=[trigger_lang],
1273
+ outputs=[trigger_out],
1274
+ )
1275
+
1276
  return demo
1277
 
1278
 
src/data/web_harvester.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web harvester — pulls Bambara/Fula data from public internet sources
3
+ into the sahel-agri-feedback HF dataset repo.
4
+
5
+ Sources:
6
+ - RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples)
7
+ - google/fleurs ff_sn (HF, Fula audio+text)
8
+ - bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only → vocabulary.jsonl)
9
+
10
+ All writes go through the same corrections.jsonl / vocabulary.jsonl files
11
+ that the Kaggle training notebook reads — no special handling needed.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import io
16
+ import json
17
+ import time
18
+ from datetime import datetime, timezone
19
+ from typing import Generator
20
+
21
+ WIKI_APIS = {
22
+ "bam": "https://bm.wikipedia.org/w/api.php",
23
+ "ful": "https://ff.wikipedia.org/w/api.php",
24
+ }
25
+
26
+ HF_ASR_SOURCES = {
27
+ "bam": [
28
+ {"repo": "RobotsMali/jeli-asr", "config": "jeli-asr", "split": "train",
29
+ "audio_col": "audio", "text_col": "bam", "max": 5_000},
30
+ ],
31
+ "ful": [
32
+ {"repo": "google/fleurs", "config": "ff_sn", "split": "train",
33
+ "audio_col": "audio", "text_col": "transcription", "max": 2_000},
34
+ ],
35
+ }
36
+
37
+
38
+ # ── Wikipedia text harvest ────────────────────────────────────────────────────
39
+
40
def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
    """Fetch short sentences from the Wikipedia edition configured for ``lang``.

    Args:
        lang: Language key looked up in ``WIKI_APIS``.
        max_articles: Upper bound on article titles to request. Only a single
            ``allpages`` request is made (no continuation), so the value is
            capped at 500.

    Returns:
        A list of ``{"word", "translation", "language", "source"}`` dicts, one
        per sentence of 3–20 words; ``"translation"`` holds the article title
        as loose context. Returns ``[]`` for an unknown language, a
        non-positive ``max_articles``, or when the title listing fails.
        Failed extract batches are logged and skipped.
    """
    import logging
    import urllib.parse
    import urllib.request

    log = logging.getLogger(__name__)

    api_url = WIKI_APIS.get(lang)
    if not api_url:
        return []
    max_articles = int(max_articles)
    if max_articles <= 0:
        return []

    def _api_get(params: dict) -> dict:
        """GET one MediaWiki API query and decode the JSON response."""
        request = urllib.request.Request(
            f"{api_url}?{urllib.parse.urlencode(params)}",
            # Wikimedia asks API clients to send a descriptive User-Agent.
            headers={"User-Agent": "SahelAgriVoiceHarvester/1.0 (text harvest for ASR training)"},
        )
        with urllib.request.urlopen(request, timeout=15) as r:
            return json.loads(r.read())

    # Step 1: get a list of article titles.
    try:
        data = _api_get({
            "action": "query",
            "list": "allpages",
            "aplimit": min(max_articles, 500),
            "apfilterredir": "nonredirects",
            "format": "json",
        })
        titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
    except Exception:
        log.warning("Wikipedia title listing failed for %s", lang, exc_info=True)
        return []
    if not titles:
        return []

    # Step 2: fetch plain-text extracts in batches of 20.
    entries = []
    for i in range(0, len(titles), 20):
        batch = titles[i:i + 20]
        try:
            data2 = _api_get({
                "action": "query",
                "titles": "|".join(batch),
                "prop": "extracts",
                # TextExtracts only returns more than one extract per request
                # when exintro is set; without it all but one title is dropped.
                "exintro": 1,
                "exsentences": 3,
                "exlimit": len(batch),
                "explaintext": 1,
                "format": "json",
            })
            for page in data2.get("query", {}).get("pages", {}).values():
                extract = (page.get("extract") or "").strip()
                title = page.get("title", "").strip()
                if not extract or not title:
                    continue
                # Split into sentences, keep those 3–20 words.
                for sentence in extract.replace("\n", " ").split("."):
                    sentence = sentence.strip()
                    if 3 <= len(sentence.split()) <= 20:
                        entries.append({
                            "word": sentence,
                            "translation": title,  # article title as loose context
                            "language": lang,
                            "source": "wikipedia",
                        })
        except Exception:
            # Best-effort: one bad batch must not discard the rest.
            log.warning("Wikipedia extract batch starting at %d failed", i, exc_info=True)
        time.sleep(0.3)  # be polite to Wikipedia servers

    return entries
103
+
104
+
105
+ # ── HF dataset audio harvest ──────────────────────────────────────────────────
106
+
107
def harvest_hf_audio(
    lang: str,
    hf_token: str | None,
    progress_cb=None,
) -> Generator[tuple[bytes, str, str], None, None]:
    """Yield ``(wav_bytes, transcription, audio_repo_path)`` for each usable sample.

    Iterates the sources configured for ``lang`` in ``HF_ASR_SOURCES``,
    streaming each dataset and resampling its audio column to 16 kHz. Samples
    with no text or fewer than 3,200 audio samples (0.2 s) are skipped.

    Args:
        lang: Language key looked up in ``HF_ASR_SOURCES``.
        hf_token: Token passed to ``datasets.load_dataset``; may be ``None``.
        progress_cb: Optional ``progress_cb(current, total, message)`` callable
            invoked once per streamed sample.

    The caller is responsible for writing to corrections.jsonl and uploading
    the audio. Nothing is yielded if the ``datasets`` package is missing; a
    source that fails to load or stream is logged and skipped.
    """
    import logging

    import numpy as np

    log = logging.getLogger(__name__)

    sources = HF_ASR_SOURCES.get(lang, [])
    if not sources:
        return

    # Import once, not once per source.
    try:
        from datasets import load_dataset, Audio as HFAudio
    except ImportError:
        log.warning("'datasets' package not installed — HF audio harvest skipped")
        return

    for src in sources:
        try:
            ds = load_dataset(
                src["repo"], src["config"],
                split=src["split"],
                streaming=True,
                token=hf_token,
                trust_remote_code=False,
            )
            ds = ds.cast_column(src["audio_col"], HFAudio(sampling_rate=16_000))
            total = src["max"]
            for i, sample in enumerate(ds.take(total)):
                if progress_cb:
                    progress_cb(i + 1, total, f"{src['repo']} ({lang})")
                try:
                    audio_arr = np.array(sample[src["audio_col"]]["array"], dtype=np.float32)
                    text = (sample.get(src["text_col"]) or "").strip()
                    if not text or len(audio_arr) < 3_200:  # skip < 0.2s
                        continue
                    wav_bytes = _numpy_to_wav_bytes(audio_arr, 16_000)
                except Exception:
                    # One malformed sample must not end the whole harvest.
                    log.debug("Skipping sample %d of %s", i, src["repo"], exc_info=True)
                    continue
                ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
                yield wav_bytes, text, f"audio/{lang}_{ts}.wav"
        except Exception:
            # Best-effort per source: log why it was skipped, then move on.
            log.warning("HF source %s failed — skipping", src.get("repo"), exc_info=True)
            continue
155
+
156
+
157
def _numpy_to_wav_bytes(audio: "np.ndarray", sr: int) -> bytes:
    """Encode float audio as a mono 16-bit PCM WAV file held in memory.

    Args:
        audio: Sample values, nominally in ``[-1.0, 1.0]``. Values are scaled
            by 32767 and clipped to the int16 range; NaN becomes silence.
        sr: Sample rate in Hz, written to the WAV header.

    Returns:
        The complete WAV file (44-byte RIFF header followed by PCM data).
    """
    import io as _io
    import wave

    import numpy as np

    scaled = np.nan_to_num(np.asarray(audio) * 32767, nan=0.0)
    # Native-endian int16: the wave module byte-swaps on big-endian hosts.
    pcm = scaled.clip(-32768, 32767).astype(np.int16)

    buf = _io.BytesIO()
    with wave.open(buf, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)
        wav.setframerate(sr)
        wav.writeframes(pcm.tobytes())
    # wave does not close a file object it did not open itself.
    return buf.getvalue()