Spaces:
Sleeping
Add mass training + internet self-teaching features
Browse files
New Tab 5 — Bulk Upload:
- Upload ZIP of audio files + CSV (filename,transcription) in one operation
- Batch-inserts all samples into corrections.jsonl + uploads audio to Hub
New Tab 6 — Self-Teaching:
- Wikipedia harvest: bm/ff Wikipedia API → sentence text → vocabulary.jsonl
(868 Bambara articles, 17k+ Fula articles)
- HF dataset import: RobotsMali/jeli-asr (33k Bambara) + google/fleurs ff_sn
(Fula) → corrections.jsonl, one click
- Auto-training trigger: fires Kaggle kernel via REST API when
corrections.jsonl reaches AUTO_TRAIN_THRESHOLD (default 50) entries
Requires KAGGLE_USERNAME + KAGGLE_KEY Space secrets
New src/data/web_harvester.py:
- harvest_wikipedia_text(lang, max_articles) — MediaWiki API, no deps
- harvest_hf_audio(lang, token) — generator yielding (wav_bytes, text, path)
- _numpy_to_wav_bytes() — stdlib-only WAV encoder
Auto-trigger also fires after each Tab 2 correction save.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app.py +407 -6
- src/data/web_harvester.py +171 -0
|
@@ -3,11 +3,14 @@ Sahel-Agri Voice AI — HuggingFace Spaces (ZeroGPU)
|
|
| 3 |
Two-way voice assistant: Bambara / Fula / French / English → voice response
|
| 4 |
|
| 5 |
Environment variables (set in Space Settings → Secrets):
|
| 6 |
-
HF_TOKEN
|
| 7 |
-
FEEDBACK_REPO_ID
|
| 8 |
-
ADAPTER_REPO_ID
|
| 9 |
-
WHISPER_MODEL_ID
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
from __future__ import annotations
|
|
@@ -33,7 +36,11 @@ FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedba
|
|
| 33 |
ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
|
| 34 |
# whisper-small: ~10s on cpu-basic, good multilingual quality.
|
| 35 |
# Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
|
| 36 |
-
WHISPER_MODEL_ID
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
# On local CPU (no HF_TOKEN / no spaces package) fall back gracefully
|
| 39 |
_ON_SPACES = os.environ.get("SPACE_ID") is not None
|
|
@@ -324,6 +331,7 @@ def _save_feedback_to_hub(
|
|
| 324 |
return f"⚠️ Audio uploaded but corrections.jsonl update failed: {e}"
|
| 325 |
|
| 326 |
total = updated.count("\n")
|
|
|
|
| 327 |
return f"✅ Saved to Hub (#{total}) — {FEEDBACK_REPO_ID}"
|
| 328 |
|
| 329 |
except Exception as e:
|
|
@@ -582,6 +590,289 @@ def _save_audio_for_training(lang_label: str, audio_path: str | None, transcript
|
|
| 582 |
return f"❌ Upload failed: {exc}"
|
| 583 |
|
| 584 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
# ── Main ask handler ──────────────────────────────────────────────────────────
|
| 586 |
|
| 587 |
def handle_ask(audio_path, language_label):
|
|
@@ -872,6 +1163,116 @@ def build_ui() -> gr.Blocks:
|
|
| 872 |
reload_btn.click(fn=_reload_adapters_from_hub, outputs=[reload_out])
|
| 873 |
reload_btn.click(fn=_get_adapter_status, outputs=[adapter_status_md])
|
| 874 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
return demo
|
| 876 |
|
| 877 |
|
|
|
|
| 3 |
Two-way voice assistant: Bambara / Fula / French / English → voice response
|
| 4 |
|
| 5 |
Environment variables (set in Space Settings → Secrets):
|
| 6 |
+
HF_TOKEN — HF write-access token
|
| 7 |
+
FEEDBACK_REPO_ID — e.g. ous-sow/sahel-agri-feedback (dataset, private)
|
| 8 |
+
ADAPTER_REPO_ID — e.g. ous-sow/sahel-agri-adapters (model, private)
|
| 9 |
+
WHISPER_MODEL_ID — default: openai/whisper-small
|
| 10 |
+
KAGGLE_USERNAME — Kaggle username (for auto-trigger training)
|
| 11 |
+
KAGGLE_KEY — Kaggle API key (for auto-trigger training)
|
| 12 |
+
KAGGLE_KERNEL_SLUG — default: ous-sow/sahel-voice-master-trainer
|
| 13 |
+
AUTO_TRAIN_THRESHOLD — corrections count that triggers auto-training (default: 50)
|
| 14 |
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
|
|
|
| 36 |
ADAPTER_REPO_ID = os.environ.get("ADAPTER_REPO_ID", "ous-sow/sahel-agri-adapters")
|
| 37 |
# whisper-small: ~10s on cpu-basic, good multilingual quality.
|
| 38 |
# Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
|
| 39 |
+
WHISPER_MODEL_ID = os.getenv("WHISPER_MODEL_ID", "openai/whisper-small")
# Kaggle credentials and kernel slug read by the auto-training trigger.
# The trigger code returns early when either credential is empty.
KAGGLE_USERNAME = os.getenv("KAGGLE_USERNAME", "")
KAGGLE_KEY = os.getenv("KAGGLE_KEY", "")
KAGGLE_KERNEL_SLUG = os.getenv("KAGGLE_KERNEL_SLUG", "ous-sow/sahel-voice-master-trainer")
# Entry count of corrections.jsonl that fires auto-training (parsed as an integer).
AUTO_TRAIN_THRESHOLD = int(os.getenv("AUTO_TRAIN_THRESHOLD", "50"))
|
| 44 |
|
| 45 |
# On local CPU (no HF_TOKEN / no spaces package) fall back gracefully
|
| 46 |
_ON_SPACES = os.environ.get("SPACE_ID") is not None
|
|
|
|
| 331 |
return f"⚠️ Audio uploaded but corrections.jsonl update failed: {e}"
|
| 332 |
|
| 333 |
total = updated.count("\n")
|
| 334 |
+
_maybe_auto_trigger()
|
| 335 |
return f"✅ Saved to Hub (#{total}) — {FEEDBACK_REPO_ID}"
|
| 336 |
|
| 337 |
except Exception as e:
|
|
|
|
| 590 |
return f"❌ Upload failed: {exc}"
|
| 591 |
|
| 592 |
|
| 593 |
+
# ── Auto-training trigger ─────────────────────────────────────────────────────
|
| 594 |
+
|
| 595 |
+
def _count_corrections() -> int:
    """Count the non-blank lines of corrections.jsonl in the feedback repo.

    Returns:
        The number of non-blank lines, or 0 when no Hub client is configured
        or when downloading/reading the file fails for any reason.
    """
    if _hf_api is None:
        return 0
    try:
        from huggingface_hub import hf_hub_download

        cached_path = hf_hub_download(
            repo_id=FEEDBACK_REPO_ID,
            filename="corrections.jsonl",
            repo_type="dataset",
            token=HF_TOKEN,
        )
        entry_count = 0
        with open(cached_path, encoding="utf-8") as handle:
            for raw_line in handle:
                if raw_line.strip():
                    entry_count += 1
        return entry_count
    except Exception:
        # Best effort: any failure is reported as "no entries".
        return 0
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
def _trigger_kaggle_training(lang: str = "bam") -> str:
    """POST a run request for the configured Kaggle kernel and report the outcome.

    Args:
        lang: Language code supplied by callers. NOTE(review): this function
            does not forward it to Kaggle — confirm whether the kernel is
            expected to receive it.

    Returns:
        A status string: a warning when the Kaggle credentials are missing,
        a success message carrying the ``currentRunningVersion`` field of the
        JSON reply (or ``started`` when absent), or a failure message with
        the exception text.
    """
    if not (KAGGLE_USERNAME and KAGGLE_KEY):
        return "⚠️ KAGGLE_USERNAME / KAGGLE_KEY not set in Space secrets — auto-trigger disabled."
    try:
        import base64
        import urllib.parse
        import urllib.request

        # HTTP Basic auth: base64("username:key").
        credentials = f"{KAGGLE_USERNAME}:{KAGGLE_KEY}".encode()
        basic_auth = base64.b64encode(credentials).decode()
        request_headers = {
            "Authorization": f"Basic {basic_auth}",
            "Content-Type": "application/json",
        }
        run_request = urllib.request.Request(
            f"https://www.kaggle.com/api/v1/kernels/{KAGGLE_KERNEL_SLUG}/run",
            data=json.dumps({"enableGpu": True}).encode(),
            method="POST",
            headers=request_headers,
        )
        with urllib.request.urlopen(run_request, timeout=15) as response:
            payload = json.loads(response.read())
        run_id = payload.get("currentRunningVersion", "started")
        return f"✅ Kaggle training triggered! Run ID: {run_id}"
    except Exception as e:
        return f"❌ Kaggle trigger failed: {e}"
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
def _maybe_auto_trigger() -> None:
|
| 635 |
+
"""Called after each correction save. Triggers Kaggle if threshold met."""
|
| 636 |
+
if not KAGGLE_USERNAME or not KAGGLE_KEY:
|
| 637 |
+
return
|
| 638 |
+
count = _count_corrections()
|
| 639 |
+
if count > 0 and count % AUTO_TRAIN_THRESHOLD == 0:
|
| 640 |
+
threading.Thread(target=_trigger_kaggle_training, daemon=True).start()
|
| 641 |
+
|
| 642 |
+
|
| 643 |
+
# ── Bulk upload handler ────────────────────────────────────────────────────────
|
| 644 |
+
|
| 645 |
+
def _bulk_upload(lang_label: str, zip_file, csv_text: str) -> str:
|
| 646 |
+
"""
|
| 647 |
+
Accept a ZIP of audio files + a CSV (filename,transcription) and batch-insert
|
| 648 |
+
all samples into corrections.jsonl. Audio stored under audio/ in the Hub repo.
|
| 649 |
+
"""
|
| 650 |
+
import zipfile, csv
|
| 651 |
+
|
| 652 |
+
if _hf_api is None:
|
| 653 |
+
return "⚠️ HF_TOKEN not set — cannot upload."
|
| 654 |
+
if zip_file is None and not csv_text.strip():
|
| 655 |
+
return "⚠️ Upload a ZIP and/or paste a CSV."
|
| 656 |
+
|
| 657 |
+
lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
|
| 658 |
+
rows = [] # (audio_bytes_or_None, filename, transcription)
|
| 659 |
+
|
| 660 |
+
# Parse CSV
|
| 661 |
+
transcript_map: dict[str, str] = {}
|
| 662 |
+
if csv_text.strip():
|
| 663 |
+
for row in csv.reader(csv_text.strip().splitlines()):
|
| 664 |
+
if len(row) >= 2:
|
| 665 |
+
transcript_map[row[0].strip()] = row[1].strip()
|
| 666 |
+
|
| 667 |
+
# Extract ZIP
|
| 668 |
+
if zip_file is not None:
|
| 669 |
+
try:
|
| 670 |
+
with zipfile.ZipFile(zip_file, "r") as zf:
|
| 671 |
+
for name in zf.namelist():
|
| 672 |
+
if not name.lower().endswith((".wav", ".mp3", ".ogg", ".flac", ".m4a")):
|
| 673 |
+
continue
|
| 674 |
+
text = transcript_map.get(name) or transcript_map.get(Path(name).name) or ""
|
| 675 |
+
if not text:
|
| 676 |
+
continue
|
| 677 |
+
rows.append((zf.read(name), Path(name).name, text))
|
| 678 |
+
except Exception as e:
|
| 679 |
+
return f"❌ ZIP read error: {e}"
|
| 680 |
+
elif transcript_map:
|
| 681 |
+
# CSV only — audio-less vocab entries
|
| 682 |
+
for fname, text in transcript_map.items():
|
| 683 |
+
rows.append((None, fname, text))
|
| 684 |
+
|
| 685 |
+
if not rows:
|
| 686 |
+
return "⚠️ No matching (audio, transcription) pairs found. Check filenames match CSV."
|
| 687 |
+
|
| 688 |
+
# Upload batch
|
| 689 |
+
records = []
|
| 690 |
+
errors = 0
|
| 691 |
+
for audio_bytes, fname, text in rows:
|
| 692 |
+
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
|
| 693 |
+
audio_path = f"audio/{lang}_{ts}.wav"
|
| 694 |
+
try:
|
| 695 |
+
if audio_bytes:
|
| 696 |
+
_hf_api.upload_file(
|
| 697 |
+
path_or_fileobj=io.BytesIO(audio_bytes),
|
| 698 |
+
path_in_repo=audio_path,
|
| 699 |
+
repo_id=FEEDBACK_REPO_ID,
|
| 700 |
+
repo_type="dataset",
|
| 701 |
+
)
|
| 702 |
+
records.append({
|
| 703 |
+
"id": ts, "timestamp": datetime.now(timezone.utc).isoformat(),
|
| 704 |
+
"language": lang,
|
| 705 |
+
"audio_file": audio_path if audio_bytes else "",
|
| 706 |
+
"transcription": text, "corrected_text": text,
|
| 707 |
+
"source": f"bulk_upload:{fname}", "is_correction": False,
|
| 708 |
+
"model": WHISPER_MODEL_ID,
|
| 709 |
+
})
|
| 710 |
+
except Exception:
|
| 711 |
+
errors += 1
|
| 712 |
+
|
| 713 |
+
# Append all to corrections.jsonl
|
| 714 |
+
from huggingface_hub import hf_hub_download
|
| 715 |
+
for attempt in range(2):
|
| 716 |
+
try:
|
| 717 |
+
local = hf_hub_download(
|
| 718 |
+
repo_id=FEEDBACK_REPO_ID, filename="corrections.jsonl",
|
| 719 |
+
repo_type="dataset", token=HF_TOKEN,
|
| 720 |
+
)
|
| 721 |
+
with open(local, encoding="utf-8") as f:
|
| 722 |
+
existing = f.read()
|
| 723 |
+
except Exception:
|
| 724 |
+
existing = ""
|
| 725 |
+
new_lines = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
|
| 726 |
+
updated = existing + new_lines
|
| 727 |
+
try:
|
| 728 |
+
_hf_api.upload_file(
|
| 729 |
+
path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
|
| 730 |
+
path_in_repo="corrections.jsonl",
|
| 731 |
+
repo_id=FEEDBACK_REPO_ID,
|
| 732 |
+
repo_type="dataset",
|
| 733 |
+
)
|
| 734 |
+
break
|
| 735 |
+
except Exception as e:
|
| 736 |
+
if attempt == 1:
|
| 737 |
+
return f"⚠️ Audio uploaded but corrections.jsonl failed: {e}"
|
| 738 |
+
|
| 739 |
+
total = updated.count("\n")
|
| 740 |
+
_maybe_auto_trigger()
|
| 741 |
+
return (
|
| 742 |
+
f"✅ Bulk upload complete!\n"
|
| 743 |
+
f" Uploaded : {len(records)} samples ({errors} errors)\n"
|
| 744 |
+
f" Dataset : {total} total corrections\n"
|
| 745 |
+
f" Auto-train threshold: {AUTO_TRAIN_THRESHOLD} entries"
|
| 746 |
+
)
|
| 747 |
+
|
| 748 |
+
|
| 749 |
+
# ── Internet self-teaching handlers ───────────────────────────────────────────
|
| 750 |
+
|
| 751 |
+
def _harvest_wikipedia(lang_label: str, max_articles: int = 100) -> str:
    """Fetch Wikipedia sentences for a language and append them to vocabulary.jsonl.

    Args:
        lang_label: UI language label, resolved through ``SUPPORTED_LANGUAGES``;
            only the codes ``bam`` and ``ful`` are accepted.
        max_articles: Upper bound on articles to fetch. Coerced to ``int``
            because UI sliders may deliver a float.

    Returns:
        A human-readable status string (success summary, warning, or error).

    Notes:
        vocabulary.jsonl is only rewritten after its current content was read
        successfully, or the Hub reported that the file does not exist yet. A
        transient download failure aborts instead of replacing the file with
        only the new entries.
    """
    if _hf_api is None:
        return "⚠️ HF_TOKEN not set."
    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
    if lang not in ("bam", "ful"):
        return "⚠️ Wikipedia harvest only supported for Bambara and Fula."

    from src.data.web_harvester import harvest_wikipedia_text

    try:
        entries = harvest_wikipedia_text(lang, max_articles=int(max_articles))
    except Exception as e:
        # Network/API failure: report it instead of crashing the UI handler.
        return f"❌ Wikipedia harvest failed: {e}"
    if not entries:
        return "⚠️ No text harvested — check network or try again."

    # Append to vocabulary.jsonl
    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import EntryNotFoundError

    new_lines = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in entries)
    updated = ""
    for attempt in range(2):
        try:
            local = hf_hub_download(
                repo_id=FEEDBACK_REPO_ID, filename="vocabulary.jsonl",
                repo_type="dataset", token=HF_TOKEN,
            )
            with open(local, encoding="utf-8") as f:
                existing = f.read()
        except EntryNotFoundError:
            # First write: the file does not exist in the repo yet.
            existing = ""
        except Exception as e:
            # Unknown current content: retry once, never overwrite blindly.
            if attempt == 1:
                return f"❌ Upload failed: {e}"
            continue
        if existing and not existing.endswith("\n"):
            # Keep the JSONL valid when the stored file lacks a final newline.
            existing += "\n"
        updated = existing + new_lines
        try:
            _hf_api.upload_file(
                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
                path_in_repo="vocabulary.jsonl",
                repo_id=FEEDBACK_REPO_ID,
                repo_type="dataset",
            )
            break
        except Exception as e:
            if attempt == 1:
                return f"❌ Upload failed: {e}"

    return (
        f"✅ Wikipedia harvest complete!\n"
        f"   Language : {lang_label}\n"
        f"   Sentences added : {len(entries)}\n"
        f"   Total vocabulary entries: {updated.count(chr(10))}"
    )
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
def _harvest_hf_dataset(lang_label: str, max_samples: int = 500) -> str:
|
| 799 |
+
"""Pull audio+transcription from public HF datasets into corrections.jsonl."""
|
| 800 |
+
if _hf_api is None:
|
| 801 |
+
return "⚠️ HF_TOKEN not set."
|
| 802 |
+
lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
|
| 803 |
+
if lang not in ("bam", "ful"):
|
| 804 |
+
return "⚠️ HF dataset harvest only supported for Bambara and Fula."
|
| 805 |
+
|
| 806 |
+
from src.data.web_harvester import harvest_hf_audio, HF_ASR_SOURCES
|
| 807 |
+
sources = HF_ASR_SOURCES.get(lang, [])
|
| 808 |
+
if not sources:
|
| 809 |
+
return f"⚠️ No HF dataset configured for {lang}."
|
| 810 |
+
|
| 811 |
+
records = []
|
| 812 |
+
errors = 0
|
| 813 |
+
for wav_bytes, text, repo_path in harvest_hf_audio(lang, HF_TOKEN):
|
| 814 |
+
try:
|
| 815 |
+
_hf_api.upload_file(
|
| 816 |
+
path_or_fileobj=io.BytesIO(wav_bytes),
|
| 817 |
+
path_in_repo=repo_path,
|
| 818 |
+
repo_id=FEEDBACK_REPO_ID,
|
| 819 |
+
repo_type="dataset",
|
| 820 |
+
)
|
| 821 |
+
ts = repo_path.split("_")[-1].replace(".wav", "")
|
| 822 |
+
records.append({
|
| 823 |
+
"id": ts, "timestamp": datetime.now(timezone.utc).isoformat(),
|
| 824 |
+
"language": lang,
|
| 825 |
+
"audio_file": repo_path,
|
| 826 |
+
"transcription": text, "corrected_text": text,
|
| 827 |
+
"source": f"hf_harvest:{sources[0]['repo']}",
|
| 828 |
+
"is_correction": False, "model": WHISPER_MODEL_ID,
|
| 829 |
+
})
|
| 830 |
+
if len(records) >= max_samples:
|
| 831 |
+
break
|
| 832 |
+
except Exception:
|
| 833 |
+
errors += 1
|
| 834 |
+
if errors > 20:
|
| 835 |
+
break
|
| 836 |
+
|
| 837 |
+
if not records:
|
| 838 |
+
return "⚠️ No samples harvested. Dataset may require accepting terms on HuggingFace first."
|
| 839 |
+
|
| 840 |
+
# Append to corrections.jsonl
|
| 841 |
+
from huggingface_hub import hf_hub_download
|
| 842 |
+
for attempt in range(2):
|
| 843 |
+
try:
|
| 844 |
+
local = hf_hub_download(
|
| 845 |
+
repo_id=FEEDBACK_REPO_ID, filename="corrections.jsonl",
|
| 846 |
+
repo_type="dataset", token=HF_TOKEN,
|
| 847 |
+
)
|
| 848 |
+
with open(local, encoding="utf-8") as f:
|
| 849 |
+
existing = f.read()
|
| 850 |
+
except Exception:
|
| 851 |
+
existing = ""
|
| 852 |
+
new_lines = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
|
| 853 |
+
updated = existing + new_lines
|
| 854 |
+
try:
|
| 855 |
+
_hf_api.upload_file(
|
| 856 |
+
path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
|
| 857 |
+
path_in_repo="corrections.jsonl",
|
| 858 |
+
repo_id=FEEDBACK_REPO_ID,
|
| 859 |
+
repo_type="dataset",
|
| 860 |
+
)
|
| 861 |
+
break
|
| 862 |
+
except Exception as e:
|
| 863 |
+
if attempt == 1:
|
| 864 |
+
return f"❌ corrections.jsonl update failed: {e}"
|
| 865 |
+
|
| 866 |
+
total = updated.count("\n")
|
| 867 |
+
_maybe_auto_trigger()
|
| 868 |
+
return (
|
| 869 |
+
f"✅ HF dataset harvest complete!\n"
|
| 870 |
+
f" Source : {sources[0]['repo']}\n"
|
| 871 |
+
f" Imported : {len(records)} samples ({errors} errors)\n"
|
| 872 |
+
f" Dataset : {total} total corrections\n"
|
| 873 |
+
)
|
| 874 |
+
|
| 875 |
+
|
| 876 |
# ── Main ask handler ──────────────────────────────────────────────────────────
|
| 877 |
|
| 878 |
def handle_ask(audio_path, language_label):
|
|
|
|
| 1163 |
reload_btn.click(fn=_reload_adapters_from_hub, outputs=[reload_out])
|
| 1164 |
reload_btn.click(fn=_get_adapter_status, outputs=[adapter_status_md])
|
| 1165 |
|
| 1166 |
+
# ── Tab 5: Bulk Upload ────────────────────────────────────────────
|
| 1167 |
+
with gr.TabItem("📦 Bulk Upload"):
|
| 1168 |
+
gr.Markdown(
|
| 1169 |
+
"## Upload many audio samples at once\n\n"
|
| 1170 |
+
"**Step 1** — Prepare a ZIP file containing your audio files (WAV/MP3).\n\n"
|
| 1171 |
+
"**Step 2** — Prepare a CSV with two columns: `filename,transcription`\n"
|
| 1172 |
+
"```\nbam_001.wav,I ni ce a tɔ\nbam_002.wav,Sanji bɛ na sini\n```\n\n"
|
| 1173 |
+
"**Step 3** — Select language, upload ZIP, paste CSV, click Upload."
|
| 1174 |
+
)
|
| 1175 |
+
with gr.Row():
|
| 1176 |
+
with gr.Column():
|
| 1177 |
+
bulk_lang = gr.Dropdown(
|
| 1178 |
+
choices=["Bambara (bam)", "Fula (ful)"],
|
| 1179 |
+
value="Bambara (bam)", label="Language"
|
| 1180 |
+
)
|
| 1181 |
+
bulk_zip = gr.File(
|
| 1182 |
+
label="ZIP file (audio files)", file_types=[".zip"]
|
| 1183 |
+
)
|
| 1184 |
+
bulk_csv = gr.Textbox(
|
| 1185 |
+
lines=10,
|
| 1186 |
+
label="CSV — filename,transcription (one per line)",
|
| 1187 |
+
placeholder="bam_001.wav,I ni ce a tɔ\nbam_002.wav,Sanji bɛ na sini",
|
| 1188 |
+
)
|
| 1189 |
+
bulk_btn = gr.Button("📤 Upload Batch", variant="primary")
|
| 1190 |
+
bulk_status = gr.Textbox(label="Status", interactive=False, lines=5)
|
| 1191 |
+
bulk_btn.click(
|
| 1192 |
+
fn=_bulk_upload,
|
| 1193 |
+
inputs=[bulk_lang, bulk_zip, bulk_csv],
|
| 1194 |
+
outputs=[bulk_status],
|
| 1195 |
+
)
|
| 1196 |
+
|
| 1197 |
+
# ── Tab 6: Self-Teaching ──────────────────────────────────────────
|
| 1198 |
+
with gr.TabItem("🌐 Self-Teaching"):
|
| 1199 |
+
gr.Markdown(
|
| 1200 |
+
"## Teach the model from the internet\n\n"
|
| 1201 |
+
"These tools pull publicly available Bambara and Fula language data "
|
| 1202 |
+
"directly into your training dataset — no manual work required."
|
| 1203 |
+
)
|
| 1204 |
+
with gr.Row():
|
| 1205 |
+
# Wikipedia harvest
|
| 1206 |
+
with gr.Column():
|
| 1207 |
+
gr.Markdown(
|
| 1208 |
+
"### 📖 Wikipedia Text Harvest\n"
|
| 1209 |
+
"Pulls sentence-length text from Bambara Wikipedia (868 articles) "
|
| 1210 |
+
"or Fula Wikipedia (17,000+ articles) into `vocabulary.jsonl`.\n\n"
|
| 1211 |
+
"Use this to expand vocabulary coverage before a training run."
|
| 1212 |
+
)
|
| 1213 |
+
wiki_lang = gr.Dropdown(
|
| 1214 |
+
choices=["Bambara (bam)", "Fula (ful)"],
|
| 1215 |
+
value="Bambara (bam)", label="Language"
|
| 1216 |
+
)
|
| 1217 |
+
wiki_articles = gr.Slider(
|
| 1218 |
+
minimum=10, maximum=500, value=100, step=10,
|
| 1219 |
+
label="Max articles to fetch"
|
| 1220 |
+
)
|
| 1221 |
+
wiki_btn = gr.Button("📖 Harvest Wikipedia Text", variant="secondary")
|
| 1222 |
+
wiki_status = gr.Textbox(label="Status", interactive=False, lines=4)
|
| 1223 |
+
wiki_btn.click(
|
| 1224 |
+
fn=_harvest_wikipedia,
|
| 1225 |
+
inputs=[wiki_lang, wiki_articles],
|
| 1226 |
+
outputs=[wiki_status],
|
| 1227 |
+
)
|
| 1228 |
+
|
| 1229 |
+
# HF dataset harvest
|
| 1230 |
+
with gr.Column():
|
| 1231 |
+
gr.Markdown(
|
| 1232 |
+
"### 🤗 HuggingFace Dataset Import\n"
|
| 1233 |
+
"Pulls real audio + transcriptions from:\n"
|
| 1234 |
+
"- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
|
| 1235 |
+
"- **Fula**: `google/fleurs ff_sn`\n\n"
|
| 1236 |
+
"Samples are added to `corrections.jsonl` and counted toward "
|
| 1237 |
+
f"the auto-training threshold ({AUTO_TRAIN_THRESHOLD} entries)."
|
| 1238 |
+
)
|
| 1239 |
+
hf_lang = gr.Dropdown(
|
| 1240 |
+
choices=["Bambara (bam)", "Fula (ful)"],
|
| 1241 |
+
value="Bambara (bam)", label="Language"
|
| 1242 |
+
)
|
| 1243 |
+
hf_samples = gr.Slider(
|
| 1244 |
+
minimum=50, maximum=2000, value=500, step=50,
|
| 1245 |
+
label="Max samples to import"
|
| 1246 |
+
)
|
| 1247 |
+
hf_btn = gr.Button("🤗 Import from HuggingFace", variant="primary")
|
| 1248 |
+
hf_status = gr.Textbox(label="Status", interactive=False, lines=5)
|
| 1249 |
+
hf_btn.click(
|
| 1250 |
+
fn=_harvest_hf_dataset,
|
| 1251 |
+
inputs=[hf_lang, hf_samples],
|
| 1252 |
+
outputs=[hf_status],
|
| 1253 |
+
)
|
| 1254 |
+
|
| 1255 |
+
gr.Markdown("---")
|
| 1256 |
+
gr.Markdown(
|
| 1257 |
+
"### ⚡ Auto-Training\n"
|
| 1258 |
+
f"When `corrections.jsonl` reaches a multiple of **{AUTO_TRAIN_THRESHOLD}** entries, "
|
| 1259 |
+
"the Kaggle training notebook is triggered automatically.\n\n"
|
| 1260 |
+
"To enable: add `KAGGLE_USERNAME` and `KAGGLE_KEY` in Space Settings → Secrets.\n\n"
|
| 1261 |
+
f"Kernel: `{KAGGLE_KERNEL_SLUG}`"
|
| 1262 |
+
)
|
| 1263 |
+
with gr.Row():
|
| 1264 |
+
trigger_lang = gr.Dropdown(
|
| 1265 |
+
choices=["Bambara (bam)", "Fula (ful)"],
|
| 1266 |
+
value="Bambara (bam)", label="Language to train"
|
| 1267 |
+
)
|
| 1268 |
+
trigger_btn = gr.Button("⚡ Trigger Training Now", variant="secondary")
|
| 1269 |
+
trigger_out = gr.Textbox(label="Status", interactive=False, lines=2)
|
| 1270 |
+
trigger_btn.click(
|
| 1271 |
+
fn=lambda l: _trigger_kaggle_training(SUPPORTED_LANGUAGES.get(l, "bam")),
|
| 1272 |
+
inputs=[trigger_lang],
|
| 1273 |
+
outputs=[trigger_out],
|
| 1274 |
+
)
|
| 1275 |
+
|
| 1276 |
return demo
|
| 1277 |
|
| 1278 |
|
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Web harvester — pulls Bambara/Fula data from public internet sources
|
| 3 |
+
into the sahel-agri-feedback HF dataset repo.
|
| 4 |
+
|
| 5 |
+
Sources:
|
| 6 |
+
- RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples)
|
| 7 |
+
- google/fleurs ff_sn (HF, Fula audio+text)
|
| 8 |
+
- bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only → vocabulary.jsonl)
|
| 9 |
+
|
| 10 |
+
All writes go through the same corrections.jsonl / vocabulary.jsonl files
|
| 11 |
+
that the Kaggle training notebook reads — no special handling needed.
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import io
|
| 16 |
+
import json
|
| 17 |
+
import time
|
| 18 |
+
from datetime import datetime, timezone
|
| 19 |
+
from typing import Generator
|
| 20 |
+
|
| 21 |
+
# MediaWiki API endpoint for each supported language code.
WIKI_APIS = {
    "bam": "https://bm.wikipedia.org/w/api.php",
    "ful": "https://ff.wikipedia.org/w/api.php",
}

# HuggingFace ASR datasets to stream per language code. Each entry names the
# dataset repo/config/split, the columns holding the audio and the transcript,
# and "max": the number of samples taken from the stream by harvest_hf_audio.
HF_ASR_SOURCES = {
    "bam": [
        {"repo": "RobotsMali/jeli-asr", "config": "jeli-asr", "split": "train",
         "audio_col": "audio", "text_col": "bam", "max": 5_000},
    ],
    "ful": [
        {"repo": "google/fleurs", "config": "ff_sn", "split": "train",
         "audio_col": "audio", "text_col": "transcription", "max": 2_000},
    ],
}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── Wikipedia text harvest ────────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
|
| 41 |
+
"""
|
| 42 |
+
Fetch up to max_articles article extracts from the language's Wikipedia.
|
| 43 |
+
Returns list of {word, translation, language} dicts suitable for vocabulary.jsonl.
|
| 44 |
+
"""
|
| 45 |
+
import urllib.request, urllib.parse
|
| 46 |
+
|
| 47 |
+
api_url = WIKI_APIS.get(lang)
|
| 48 |
+
if not api_url:
|
| 49 |
+
return []
|
| 50 |
+
|
| 51 |
+
# Step 1: get a list of article titles
|
| 52 |
+
params = urllib.parse.urlencode({
|
| 53 |
+
"action": "query",
|
| 54 |
+
"list": "allpages",
|
| 55 |
+
"aplimit": max_articles,
|
| 56 |
+
"apfilterredir": "nonredirects",
|
| 57 |
+
"format": "json",
|
| 58 |
+
})
|
| 59 |
+
with urllib.request.urlopen(f"{api_url}?{params}", timeout=15) as r:
|
| 60 |
+
data = json.loads(r.read())
|
| 61 |
+
|
| 62 |
+
titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
|
| 63 |
+
if not titles:
|
| 64 |
+
return []
|
| 65 |
+
|
| 66 |
+
# Step 2: fetch plain-text extracts in batches of 20
|
| 67 |
+
entries = []
|
| 68 |
+
for i in range(0, len(titles), 20):
|
| 69 |
+
batch = titles[i:i + 20]
|
| 70 |
+
params2 = urllib.parse.urlencode({
|
| 71 |
+
"action": "query",
|
| 72 |
+
"titles": "|".join(batch),
|
| 73 |
+
"prop": "extracts",
|
| 74 |
+
"exsentences": 3,
|
| 75 |
+
"exlimit": len(batch),
|
| 76 |
+
"explaintext": True,
|
| 77 |
+
"format": "json",
|
| 78 |
+
})
|
| 79 |
+
try:
|
| 80 |
+
with urllib.request.urlopen(f"{api_url}?{params2}", timeout=15) as r:
|
| 81 |
+
data2 = json.loads(r.read())
|
| 82 |
+
for page in data2.get("query", {}).get("pages", {}).values():
|
| 83 |
+
extract = (page.get("extract") or "").strip()
|
| 84 |
+
title = page.get("title", "").strip()
|
| 85 |
+
if not extract or not title:
|
| 86 |
+
continue
|
| 87 |
+
# Split into sentences, keep those 3–20 words
|
| 88 |
+
for sentence in extract.replace("\n", " ").split("."):
|
| 89 |
+
sentence = sentence.strip()
|
| 90 |
+
words = sentence.split()
|
| 91 |
+
if 3 <= len(words) <= 20:
|
| 92 |
+
entries.append({
|
| 93 |
+
"word": sentence,
|
| 94 |
+
"translation": title, # use article title as loose context
|
| 95 |
+
"language": lang,
|
| 96 |
+
"source": "wikipedia",
|
| 97 |
+
})
|
| 98 |
+
except Exception:
|
| 99 |
+
pass
|
| 100 |
+
time.sleep(0.3) # be polite to Wikipedia servers
|
| 101 |
+
|
| 102 |
+
return entries
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── HF dataset audio harvest ──────────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
def harvest_hf_audio(
    lang: str,
    hf_token: str | None,
    progress_cb=None,
) -> Generator[tuple[bytes, str, str], None, None]:
    """
    Stream the configured HF ASR datasets for ``lang`` and yield one tuple
    ``(wav_bytes, transcription, audio_repo_path)`` per usable sample.

    Samples with an empty transcript or fewer than 3,200 audio samples
    (0.2 s at 16 kHz) are skipped. Audio is resampled to 16 kHz through the
    dataset's audio column cast before it is encoded as WAV.

    Writing corrections.jsonl and uploading the audio is left to the caller.
    If given, ``progress_cb(current, total, message)`` is invoked once per
    streamed sample. Errors in a sample skip that sample; errors in a source
    skip the rest of that source.
    """
    import numpy as np

    configured = HF_ASR_SOURCES.get(lang, [])
    if not configured:
        return

    try:
        from datasets import load_dataset, Audio as HFAudio
    except ImportError:
        # Without the datasets package no source can be read.
        return

    for source in configured:
        try:
            stream = load_dataset(
                source["repo"],
                source["config"],
                split=source["split"],
                streaming=True,
                token=hf_token,
                trust_remote_code=False,
            ).cast_column(source["audio_col"], HFAudio(sampling_rate=16_000))
            limit = source["max"]
            for position, sample in enumerate(stream.take(limit), start=1):
                if progress_cb:
                    progress_cb(position, limit, f"{source['repo']} ({lang})")
                try:
                    waveform = np.array(sample[source["audio_col"]]["array"], dtype=np.float32)
                    text = (sample.get(source["text_col"]) or "").strip()
                    if text and len(waveform) >= 3_200:  # skip < 0.2s
                        # Convert to WAV bytes
                        wav_bytes = _numpy_to_wav_bytes(waveform, 16_000)
                        stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
                        yield wav_bytes, text, f"audio/{lang}_{stamp}.wav"
                except Exception:
                    continue
        except Exception:
            continue
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _numpy_to_wav_bytes(audio: "np.ndarray", sr: int) -> bytes:
|
| 158 |
+
import struct, io as _io
|
| 159 |
+
audio_pcm = (audio * 32767).clip(-32768, 32767).astype("<i2")
|
| 160 |
+
data_bytes = audio_pcm.tobytes()
|
| 161 |
+
buf = _io.BytesIO()
|
| 162 |
+
# WAV header
|
| 163 |
+
buf.write(b"RIFF")
|
| 164 |
+
buf.write(struct.pack("<I", 36 + len(data_bytes)))
|
| 165 |
+
buf.write(b"WAVE")
|
| 166 |
+
buf.write(b"fmt ")
|
| 167 |
+
buf.write(struct.pack("<IHHIIHH", 16, 1, 1, sr, sr * 2, 2, 16))
|
| 168 |
+
buf.write(b"data")
|
| 169 |
+
buf.write(struct.pack("<I", len(data_bytes)))
|
| 170 |
+
buf.write(data_bytes)
|
| 171 |
+
return buf.getvalue()
|