Spaces:

MataStrategy
/

ground-zero

Sleeping

jefffffff9 Claude Sonnet 4.6 committed on Apr 9

Commit

eddcaea

1 Parent(s): 98eece8

Add mass training + internet self-teaching features

New Tab 5 — Bulk Upload:
- Upload ZIP of audio files + CSV (filename,transcription) in one operation
- Batch-inserts all samples into corrections.jsonl + uploads audio to Hub

New Tab 6 — Self-Teaching:
- Wikipedia harvest: bm/ff Wikipedia API → sentence text → vocabulary.jsonl
(868 Bambara articles, 17k+ Fula articles)
- HF dataset import: RobotsMali/jeli-asr (33k Bambara) + google/fleurs ff_sn
(Fula) → corrections.jsonl, one click
- Auto-training trigger: fires Kaggle kernel via REST API when
corrections.jsonl reaches AUTO_TRAIN_THRESHOLD (default 50) entries
Requires KAGGLE_USERNAME + KAGGLE_KEY Space secrets

New src/data/web_harvester.py:
- harvest_wikipedia_text(lang, max_articles) — MediaWiki API, no deps
- harvest_hf_audio(lang, token) — generator yielding (wav_bytes, text, path)
- _numpy_to_wav_bytes() — stdlib-only WAV encoder

Auto-trigger also fires after each Tab 2 correction save.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

app.py +407 -6
src/data/web_harvester.py +171 -0

app.py CHANGED Viewed

@@ -3,11 +3,14 @@ Sahel-Agri Voice AI — HuggingFace Spaces (ZeroGPU)
 Two-way voice assistant: Bambara / Fula / French / English → voice response
 Environment variables (set in Space Settings → Secrets):
-  HF_TOKEN          — HF write-access token
-  FEEDBACK_REPO_ID  — e.g. ous-sow/sahel-agri-feedback  (dataset, private)
-  ADAPTER_REPO_ID   — e.g. ous-sow/sahel-agri-adapters   (model, private)
-  WHISPER_MODEL_ID  — default: openai/whisper-large-v3-turbo
-                      (use openai/whisper-base for local CPU testing)
 """
 from __future__ import annotations
@@ -33,7 +36,11 @@ FEEDBACK_REPO_ID = os.environ.get("FEEDBACK_REPO_ID", "ous-sow/sahel-agri-feedba
 ADAPTER_REPO_ID  = os.environ.get("ADAPTER_REPO_ID",  "ous-sow/sahel-agri-adapters")
 # whisper-small: ~10s on cpu-basic, good multilingual quality.
 # Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
-WHISPER_MODEL_ID = os.environ.get("WHISPER_MODEL_ID",  "openai/whisper-small")
 # On local CPU (no HF_TOKEN / no spaces package) fall back gracefully
 _ON_SPACES = os.environ.get("SPACE_ID") is not None
@@ -324,6 +331,7 @@ def _save_feedback_to_hub(
                     return f"⚠️ Audio uploaded but corrections.jsonl update failed: {e}"
         total = updated.count("\n")
         return f"✅ Saved to Hub (#{total}) — {FEEDBACK_REPO_ID}"
     except Exception as e:
@@ -582,6 +590,289 @@ def _save_audio_for_training(lang_label: str, audio_path: str | None, transcript
         return f"❌ Upload failed: {exc}"
 # ── Main ask handler ──────────────────────────────────────────────────────────
 def handle_ask(audio_path, language_label):
@@ -872,6 +1163,116 @@ def build_ui() -> gr.Blocks:
                 reload_btn.click(fn=_reload_adapters_from_hub, outputs=[reload_out])
                 reload_btn.click(fn=_get_adapter_status, outputs=[adapter_status_md])
     return demo

 Two-way voice assistant: Bambara / Fula / French / English → voice response
 Environment variables (set in Space Settings → Secrets):
+  HF_TOKEN               — HF write-access token
+  FEEDBACK_REPO_ID       — e.g. ous-sow/sahel-agri-feedback  (dataset, private)
+  ADAPTER_REPO_ID        — e.g. ous-sow/sahel-agri-adapters   (model, private)
+  WHISPER_MODEL_ID       — default: openai/whisper-small
+  KAGGLE_USERNAME        — Kaggle username (for auto-trigger training)
+  KAGGLE_KEY             — Kaggle API key  (for auto-trigger training)
+  KAGGLE_KERNEL_SLUG     — default: ous-sow/sahel-voice-master-trainer
+  AUTO_TRAIN_THRESHOLD   — corrections count that triggers auto-training (default: 50)
 """
 from __future__ import annotations
 ADAPTER_REPO_ID  = os.environ.get("ADAPTER_REPO_ID",  "ous-sow/sahel-agri-adapters")
 # whisper-small: ~10s on cpu-basic, good multilingual quality.
 # Override via WHISPER_MODEL_ID env var if you upgrade to a GPU Space later.
+WHISPER_MODEL_ID     = os.environ.get("WHISPER_MODEL_ID",    "openai/whisper-small")
+# Kaggle credentials; while either is empty the auto-training trigger stays disabled.
+KAGGLE_USERNAME      = os.environ.get("KAGGLE_USERNAME",      "")
+KAGGLE_KEY           = os.environ.get("KAGGLE_KEY",           "")
+# Slug interpolated into the Kaggle kernel "run" URL by _trigger_kaggle_training.
+KAGGLE_KERNEL_SLUG   = os.environ.get("KAGGLE_KERNEL_SLUG",   "ous-sow/sahel-voice-master-trainer")
+# Parsed with int() at import time, so a non-numeric value raises ValueError on startup.
+AUTO_TRAIN_THRESHOLD = int(os.environ.get("AUTO_TRAIN_THRESHOLD", "50"))
 # On local CPU (no HF_TOKEN / no spaces package) fall back gracefully
 _ON_SPACES = os.environ.get("SPACE_ID") is not None
                     return f"⚠️ Audio uploaded but corrections.jsonl update failed: {e}"
         total = updated.count("\n")
+        _maybe_auto_trigger()
         return f"✅ Saved to Hub (#{total}) — {FEEDBACK_REPO_ID}"
     except Exception as e:
         return f"❌ Upload failed: {exc}"
+# ── Auto-training trigger ─────────────────────────────────────────────────────
+def _count_corrections() -> int:
+    """Count the non-blank lines of corrections.jsonl in the feedback repo.
+
+    Returns 0 when no Hub client is configured, or when the file cannot be
+    downloaded or read for any reason.
+    """
+    if _hf_api is None:
+        return 0
+    try:
+        from huggingface_hub import hf_hub_download
+        cached_path = hf_hub_download(
+            repo_id=FEEDBACK_REPO_ID,
+            filename="corrections.jsonl",
+            repo_type="dataset",
+            token=HF_TOKEN,
+        )
+        with open(cached_path, encoding="utf-8") as handle:
+            non_blank = [line for line in handle if line.strip()]
+        return len(non_blank)
+    except Exception:
+        # Best effort: any failure is reported as "no corrections".
+        return 0
+def _trigger_kaggle_training(lang: str = "bam") -> str:
+    """POST a run request for KAGGLE_KERNEL_SLUG to the Kaggle REST API.
+
+    Returns a status message for the UI; failures are reported in the
+    message rather than raised.
+
+    NOTE(review): ``lang`` is accepted but is not sent with the request.
+    """
+    if not (KAGGLE_USERNAME and KAGGLE_KEY):
+        return "⚠️ KAGGLE_USERNAME / KAGGLE_KEY not set in Space secrets — auto-trigger disabled."
+    try:
+        import urllib.request, urllib.parse, base64
+        # HTTP Basic auth: base64 of "username:key".
+        credentials = f"{KAGGLE_USERNAME}:{KAGGLE_KEY}".encode()
+        basic_auth = base64.b64encode(credentials).decode()
+        request = urllib.request.Request(
+            f"https://www.kaggle.com/api/v1/kernels/{KAGGLE_KERNEL_SLUG}/run",
+            data=json.dumps({"enableGpu": True}).encode(),
+            method="POST",
+            headers={
+                "Authorization": f"Basic {basic_auth}",
+                "Content-Type":  "application/json",
+            },
+        )
+        with urllib.request.urlopen(request, timeout=15) as response:
+            payload = json.loads(response.read())
+        run_id = payload.get("currentRunningVersion", "started")
+        return f"✅ Kaggle training triggered! Run ID: {run_id}"
+    except Exception as e:
+        return f"❌ Kaggle trigger failed: {e}"
+def _maybe_auto_trigger() -> None:
+    """Start a Kaggle training run when the corrections count crosses a threshold multiple.
+
+    Called after each write to corrections.jsonl.  The count seen on the
+    previous call is remembered on the function object, so a batch that jumps
+    over a multiple of AUTO_TRAIN_THRESHOLD (e.g. 40 -> 130) still fires once.
+    On the first call in a process there is no previous count, so the check
+    falls back to "count is an exact multiple of the threshold".
+
+    Does nothing when the Kaggle credentials are missing, when the threshold
+    is not positive, or when the count is 0 (which is also what
+    _count_corrections returns on failure).
+    """
+    if not KAGGLE_USERNAME or not KAGGLE_KEY:
+        return
+    if AUTO_TRAIN_THRESHOLD <= 0:
+        # Guards the integer division below against a zero/negative setting.
+        return
+    count = _count_corrections()
+    if count <= 0:
+        return
+    previous = getattr(_maybe_auto_trigger, "_last_count", None)
+    _maybe_auto_trigger._last_count = count
+    if previous is None or previous >= count:
+        # No usable baseline: comparing against count - 1 is exactly the
+        # "count % AUTO_TRAIN_THRESHOLD == 0" test.
+        previous = count - 1
+    if count // AUTO_TRAIN_THRESHOLD > previous // AUTO_TRAIN_THRESHOLD:
+        # Run in a daemon thread so the caller is not blocked on the HTTP request.
+        threading.Thread(target=_trigger_kaggle_training, daemon=True).start()
+# ── Bulk upload handler ────────────────────────────────────────────────────────
+def _bulk_upload(lang_label: str, zip_file, csv_text: str) -> str:
+    """Batch-insert (audio, transcription) samples into corrections.jsonl.
+
+    Args:
+        lang_label: UI language label; looked up in SUPPORTED_LANGUAGES,
+            falling back to "bam".
+        zip_file: ZIP archive of audio files (anything zipfile.ZipFile
+            accepts), or None.
+        csv_text: CSV rows of ``filename,transcription``.  A literal
+            ``filename,transcription`` header row is ignored.
+
+    With a ZIP, every audio member whose full or base name has a CSV
+    transcription is uploaded under audio/ in FEEDBACK_REPO_ID.  Without a
+    ZIP, every CSV row becomes an audio-less entry.
+
+    Returns:
+        A status message for the UI; failures are reported in the message
+        rather than raised.
+    """
+    import zipfile, csv
+    if _hf_api is None:
+        return "⚠️ HF_TOKEN not set — cannot upload."
+    csv_text = (csv_text or "").strip()
+    if zip_file is None and not csv_text:
+        return "⚠️ Upload a ZIP and/or paste a CSV."
+    lang   = SUPPORTED_LANGUAGES.get(lang_label, "bam")
+    rows   = []   # (audio_bytes_or_None, filename, transcription)
+    # Parse CSV
+    transcript_map: dict[str, str] = {}
+    for row in csv.reader(csv_text.splitlines()):
+        if len(row) < 2:
+            continue
+        name_cell, text_cell = row[0].strip(), row[1].strip()
+        if (name_cell.lower(), text_cell.lower()) == ("filename", "transcription"):
+            # Header row, not a sample.
+            continue
+        transcript_map[name_cell] = text_cell
+    # Extract ZIP
+    if zip_file is not None:
+        try:
+            with zipfile.ZipFile(zip_file, "r") as zf:
+                for name in zf.namelist():
+                    if not name.lower().endswith((".wav", ".mp3", ".ogg", ".flac", ".m4a")):
+                        continue
+                    text = transcript_map.get(name) or transcript_map.get(Path(name).name) or ""
+                    if not text:
+                        continue
+                    rows.append((zf.read(name), Path(name).name, text))
+        except Exception as e:
+            return f"❌ ZIP read error: {e}"
+    elif transcript_map:
+        # CSV only — audio-less vocab entries
+        for fname, text in transcript_map.items():
+            rows.append((None, fname, text))
+    if not rows:
+        return "⚠️ No matching (audio, transcription) pairs found. Check filenames match CSV."
+    # Upload batch
+    records = []
+    errors  = 0
+    last_error = None
+    for audio_bytes, fname, text in rows:
+        ts          = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
+        # NOTE(review): non-WAV uploads are also stored under a .wav name —
+        # confirm the training side decodes by content, not by extension.
+        audio_path  = f"audio/{lang}_{ts}.wav"
+        try:
+            if audio_bytes:
+                _hf_api.upload_file(
+                    path_or_fileobj=io.BytesIO(audio_bytes),
+                    path_in_repo=audio_path,
+                    repo_id=FEEDBACK_REPO_ID,
+                    repo_type="dataset",
+                )
+            records.append({
+                "id": ts, "timestamp": datetime.now(timezone.utc).isoformat(),
+                "language": lang,
+                "audio_file": audio_path if audio_bytes else "",
+                "transcription": text, "corrected_text": text,
+                "source": f"bulk_upload:{fname}", "is_correction": False,
+                "model": WHISPER_MODEL_ID,
+            })
+        except Exception as e:
+            # Best effort per sample: count the failure and keep going.
+            errors += 1
+            last_error = e
+    if not records:
+        # Nothing succeeded, so there is nothing to append; do not report success.
+        return f"❌ All {errors} uploads failed — corrections.jsonl not changed. Last error: {last_error}"
+    # Append all to corrections.jsonl
+    from huggingface_hub import hf_hub_download
+    from huggingface_hub.utils import EntryNotFoundError
+    new_lines = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
+    updated   = ""
+    for attempt in range(2):
+        try:
+            try:
+                local = hf_hub_download(
+                    repo_id=FEEDBACK_REPO_ID, filename="corrections.jsonl",
+                    repo_type="dataset", token=HF_TOKEN,
+                )
+                with open(local, encoding="utf-8") as f:
+                    existing = f.read()
+            except EntryNotFoundError:
+                # Only a genuinely missing file means "start empty".  Any other
+                # download error must not lead to overwriting the Hub file
+                # with just the new records.
+                existing = ""
+            if existing and not existing.endswith("\n"):
+                # Keep the last existing record and the first new one on separate lines.
+                existing += "\n"
+            updated = existing + new_lines
+            _hf_api.upload_file(
+                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
+                path_in_repo="corrections.jsonl",
+                repo_id=FEEDBACK_REPO_ID,
+                repo_type="dataset",
+            )
+            break
+        except Exception as e:
+            if attempt == 1:
+                return f"⚠️ Audio uploaded but corrections.jsonl failed: {e}"
+    total = updated.count("\n")
+    _maybe_auto_trigger()
+    return (
+        f"✅ Bulk upload complete!\n"
+        f"  Uploaded : {len(records)} samples ({errors} errors)\n"
+        f"  Dataset  : {total} total corrections\n"
+        f"  Auto-train threshold: {AUTO_TRAIN_THRESHOLD} entries"
+    )
+# ── Internet self-teaching handlers ───────────────────────────────────────────
+def _harvest_wikipedia(lang_label: str, max_articles: int = 100) -> str:
+    """Fetch Wikipedia text for this language and append it to vocabulary.jsonl.
+
+    Args:
+        lang_label: UI language label; looked up in SUPPORTED_LANGUAGES,
+            falling back to "bam".  Only "bam" and "ful" are harvested.
+        max_articles: Upper bound on articles to fetch; converted to int
+            before it is passed to the harvester.
+
+    Returns:
+        A status message for the UI; failures are reported in the message
+        rather than raised.
+    """
+    if _hf_api is None:
+        return "⚠️ HF_TOKEN not set."
+    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
+    if lang not in ("bam", "ful"):
+        return "⚠️ Wikipedia harvest only supported for Bambara and Fula."
+    from src.data.web_harvester import harvest_wikipedia_text
+    try:
+        # The UI slider may deliver a float; the article limit must be an integer.
+        entries = harvest_wikipedia_text(lang, max_articles=int(max_articles))
+    except Exception as e:
+        return f"❌ Wikipedia harvest failed: {e}"
+    if not entries:
+        return "⚠️ No text harvested — check network or try again."
+    # Append to vocabulary.jsonl
+    from huggingface_hub import hf_hub_download
+    from huggingface_hub.utils import EntryNotFoundError
+    new_lines = "".join(json.dumps(e, ensure_ascii=False) + "\n" for e in entries)
+    updated   = ""
+    for attempt in range(2):
+        try:
+            try:
+                local = hf_hub_download(
+                    repo_id=FEEDBACK_REPO_ID, filename="vocabulary.jsonl",
+                    repo_type="dataset", token=HF_TOKEN,
+                )
+                with open(local, encoding="utf-8") as f:
+                    existing = f.read()
+            except EntryNotFoundError:
+                # Only a genuinely missing file means "start empty".  Any other
+                # download error must not lead to overwriting the Hub file
+                # with just the new entries.
+                existing = ""
+            if existing and not existing.endswith("\n"):
+                # Keep the last existing entry and the first new one on separate lines.
+                existing += "\n"
+            updated = existing + new_lines
+            _hf_api.upload_file(
+                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
+                path_in_repo="vocabulary.jsonl",
+                repo_id=FEEDBACK_REPO_ID,
+                repo_type="dataset",
+            )
+            break
+        except Exception as e:
+            if attempt == 1:
+                return f"❌ Upload failed: {e}"
+    return (
+        f"✅ Wikipedia harvest complete!\n"
+        f"  Language : {lang_label}\n"
+        f"  Sentences added : {len(entries)}\n"
+        f"  Total vocabulary entries: {updated.count(chr(10))}"
+    )
+def _harvest_hf_dataset(lang_label: str, max_samples: int = 500) -> str:
+    """Pull audio+transcription from public HF datasets into corrections.jsonl.
+
+    Args:
+        lang_label: UI language label; looked up in SUPPORTED_LANGUAGES,
+            falling back to "bam".  Only "bam" and "ful" are harvested.
+        max_samples: Stop after this many samples have been uploaded.
+
+    Each harvested clip is uploaded to FEEDBACK_REPO_ID at the path the
+    harvester proposes, then all new records are appended to corrections.jsonl
+    in a single write.  Harvesting stops early after more than 20 failed
+    uploads.
+
+    Returns:
+        A status message for the UI; failures are reported in the message
+        rather than raised.
+    """
+    if _hf_api is None:
+        return "⚠️ HF_TOKEN not set."
+    lang = SUPPORTED_LANGUAGES.get(lang_label, "bam")
+    if lang not in ("bam", "ful"):
+        return "⚠️ HF dataset harvest only supported for Bambara and Fula."
+    from src.data.web_harvester import harvest_hf_audio, HF_ASR_SOURCES
+    sources = HF_ASR_SOURCES.get(lang, [])
+    if not sources:
+        return f"⚠️ No HF dataset configured for {lang}."
+    records = []
+    errors  = 0
+    id_prefix = f"{lang}_"
+    for wav_bytes, text, repo_path in harvest_hf_audio(lang, HF_TOKEN):
+        try:
+            _hf_api.upload_file(
+                path_or_fileobj=io.BytesIO(wav_bytes),
+                path_in_repo=repo_path,
+                repo_id=FEEDBACK_REPO_ID,
+                repo_type="dataset",
+            )
+            # Use the whole timestamp in the file name as the id.  The
+            # timestamp itself contains underscores, so taking only the last
+            # "_" field would keep just the microseconds and give
+            # non-unique ids.
+            ts = repo_path.rsplit("/", 1)[-1]
+            if ts.endswith(".wav"):
+                ts = ts[:-len(".wav")]
+            if ts.startswith(id_prefix):
+                ts = ts[len(id_prefix):]
+            records.append({
+                "id": ts, "timestamp": datetime.now(timezone.utc).isoformat(),
+                "language": lang,
+                "audio_file": repo_path,
+                "transcription": text, "corrected_text": text,
+                "source": f"hf_harvest:{sources[0]['repo']}",
+                "is_correction": False, "model": WHISPER_MODEL_ID,
+            })
+            if len(records) >= max_samples:
+                break
+        except Exception:
+            # Best effort per sample; give up once failures pile up.
+            errors += 1
+            if errors > 20:
+                break
+    if not records:
+        return "⚠️ No samples harvested. Dataset may require accepting terms on HuggingFace first."
+    # Append to corrections.jsonl
+    from huggingface_hub import hf_hub_download
+    from huggingface_hub.utils import EntryNotFoundError
+    new_lines = "".join(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
+    updated   = ""
+    for attempt in range(2):
+        try:
+            try:
+                local = hf_hub_download(
+                    repo_id=FEEDBACK_REPO_ID, filename="corrections.jsonl",
+                    repo_type="dataset", token=HF_TOKEN,
+                )
+                with open(local, encoding="utf-8") as f:
+                    existing = f.read()
+            except EntryNotFoundError:
+                # Only a genuinely missing file means "start empty".  Any other
+                # download error must not lead to overwriting the Hub file
+                # with just the new records.
+                existing = ""
+            if existing and not existing.endswith("\n"):
+                # Keep the last existing record and the first new one on separate lines.
+                existing += "\n"
+            updated = existing + new_lines
+            _hf_api.upload_file(
+                path_or_fileobj=io.BytesIO(updated.encode("utf-8")),
+                path_in_repo="corrections.jsonl",
+                repo_id=FEEDBACK_REPO_ID,
+                repo_type="dataset",
+            )
+            break
+        except Exception as e:
+            if attempt == 1:
+                return f"❌ corrections.jsonl update failed: {e}"
+    total = updated.count("\n")
+    _maybe_auto_trigger()
+    return (
+        f"✅ HF dataset harvest complete!\n"
+        f"  Source   : {sources[0]['repo']}\n"
+        f"  Imported : {len(records)} samples ({errors} errors)\n"
+        f"  Dataset  : {total} total corrections\n"
+    )
 # ── Main ask handler ──────────────────────────────────────────────────────────
 def handle_ask(audio_path, language_label):
                 reload_btn.click(fn=_reload_adapters_from_hub, outputs=[reload_out])
                 reload_btn.click(fn=_get_adapter_status, outputs=[adapter_status_md])
+            # ── Tab 5: Bulk Upload ────────────────────────────────────────────
+            with gr.TabItem("📦 Bulk Upload"):
+                gr.Markdown(
+                    "## Upload many audio samples at once\n\n"
+                    "**Step 1** — Prepare a ZIP file containing your audio files (WAV/MP3).\n\n"
+                    "**Step 2** — Prepare a CSV with two columns: `filename,transcription`\n"
+                    "```\nbam_001.wav,I ni ce a tɔ\nbam_002.wav,Sanji bɛ na sini\n```\n\n"
+                    "**Step 3** — Select language, upload ZIP, paste CSV, click Upload."
+                )
+                with gr.Row():
+                    with gr.Column():
+                        bulk_lang = gr.Dropdown(
+                            choices=["Bambara (bam)", "Fula (ful)"],
+                            value="Bambara (bam)", label="Language"
+                        )
+                        bulk_zip = gr.File(
+                            label="ZIP file (audio files)", file_types=[".zip"]
+                        )
+                        bulk_csv = gr.Textbox(
+                            lines=10,
+                            label="CSV — filename,transcription (one per line)",
+                            placeholder="bam_001.wav,I ni ce a tɔ\nbam_002.wav,Sanji bɛ na sini",
+                        )
+                        bulk_btn = gr.Button("📤 Upload Batch", variant="primary")
+                        bulk_status = gr.Textbox(label="Status", interactive=False, lines=5)
+                bulk_btn.click(
+                    fn=_bulk_upload,
+                    inputs=[bulk_lang, bulk_zip, bulk_csv],
+                    outputs=[bulk_status],
+                )
+            # ── Tab 6: Self-Teaching ──────────────────────────────────────────
+            with gr.TabItem("🌐 Self-Teaching"):
+                gr.Markdown(
+                    "## Teach the model from the internet\n\n"
+                    "These tools pull publicly available Bambara and Fula language data "
+                    "directly into your training dataset — no manual work required."
+                )
+                with gr.Row():
+                    # Wikipedia harvest
+                    with gr.Column():
+                        gr.Markdown(
+                            "### 📖 Wikipedia Text Harvest\n"
+                            "Pulls sentence-length text from Bambara Wikipedia (868 articles) "
+                            "or Fula Wikipedia (17,000+ articles) into `vocabulary.jsonl`.\n\n"
+                            "Use this to expand vocabulary coverage before a training run."
+                        )
+                        wiki_lang = gr.Dropdown(
+                            choices=["Bambara (bam)", "Fula (ful)"],
+                            value="Bambara (bam)", label="Language"
+                        )
+                        wiki_articles = gr.Slider(
+                            minimum=10, maximum=500, value=100, step=10,
+                            label="Max articles to fetch"
+                        )
+                        wiki_btn = gr.Button("📖 Harvest Wikipedia Text", variant="secondary")
+                        wiki_status = gr.Textbox(label="Status", interactive=False, lines=4)
+                        wiki_btn.click(
+                            fn=_harvest_wikipedia,
+                            inputs=[wiki_lang, wiki_articles],
+                            outputs=[wiki_status],
+                        )
+                    # HF dataset harvest
+                    with gr.Column():
+                        gr.Markdown(
+                            "### 🤗 HuggingFace Dataset Import\n"
+                            "Pulls real audio + transcriptions from:\n"
+                            "- **Bambara**: `RobotsMali/jeli-asr` (33,000 samples)\n"
+                            "- **Fula**: `google/fleurs ff_sn`\n\n"
+                            "Samples are added to `corrections.jsonl` and counted toward "
+                            f"the auto-training threshold ({AUTO_TRAIN_THRESHOLD} entries)."
+                        )
+                        hf_lang = gr.Dropdown(
+                            choices=["Bambara (bam)", "Fula (ful)"],
+                            value="Bambara (bam)", label="Language"
+                        )
+                        hf_samples = gr.Slider(
+                            minimum=50, maximum=2000, value=500, step=50,
+                            label="Max samples to import"
+                        )
+                        hf_btn = gr.Button("🤗 Import from HuggingFace", variant="primary")
+                        hf_status = gr.Textbox(label="Status", interactive=False, lines=5)
+                        hf_btn.click(
+                            fn=_harvest_hf_dataset,
+                            inputs=[hf_lang, hf_samples],
+                            outputs=[hf_status],
+                        )
+                gr.Markdown("---")
+                gr.Markdown(
+                    "### ⚡ Auto-Training\n"
+                    f"When `corrections.jsonl` reaches a multiple of **{AUTO_TRAIN_THRESHOLD}** entries, "
+                    "the Kaggle training notebook is triggered automatically.\n\n"
+                    "To enable: add `KAGGLE_USERNAME` and `KAGGLE_KEY` in Space Settings → Secrets.\n\n"
+                    f"Kernel: `{KAGGLE_KERNEL_SLUG}`"
+                )
+                with gr.Row():
+                    trigger_lang = gr.Dropdown(
+                        choices=["Bambara (bam)", "Fula (ful)"],
+                        value="Bambara (bam)", label="Language to train"
+                    )
+                    trigger_btn = gr.Button("⚡ Trigger Training Now", variant="secondary")
+                trigger_out = gr.Textbox(label="Status", interactive=False, lines=2)
+                trigger_btn.click(
+                    fn=lambda l: _trigger_kaggle_training(SUPPORTED_LANGUAGES.get(l, "bam")),
+                    inputs=[trigger_lang],
+                    outputs=[trigger_out],
+                )
     return demo

src/data/web_harvester.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""
+Web harvester — pulls Bambara/Fula data from public internet sources
+into the sahel-agri-feedback HF dataset repo.
+Sources:
+  - RobotsMali/jeli-asr         (HF, 33k Bambara audio+text samples)
+  - google/fleurs ff_sn          (HF, Fula audio+text)
+  - bm.wikipedia.org / ff.wikipedia.org  (Wikipedia API, text only → vocabulary.jsonl)
+All writes go through the same corrections.jsonl / vocabulary.jsonl files
+that the Kaggle training notebook reads — no special handling needed.
+"""
+from __future__ import annotations
+import io
+import json
+import time
+from datetime import datetime, timezone
+from typing import Generator
+# MediaWiki API endpoint per internal language code.
+WIKI_APIS = dict(
+    bam="https://bm.wikipedia.org/w/api.php",
+    ful="https://ff.wikipedia.org/w/api.php",
+)
+# ASR sources per language.  Each entry names the Hub repo, its config and
+# split, the audio and text columns to read, and a cap on samples taken.
+HF_ASR_SOURCES = {
+    "bam": [
+        {
+            "repo": "RobotsMali/jeli-asr",
+            "config": "jeli-asr",
+            "split": "train",
+            "audio_col": "audio",
+            "text_col": "bam",
+            "max": 5_000,
+        },
+    ],
+    "ful": [
+        {
+            "repo": "google/fleurs",
+            "config": "ff_sn",
+            "split": "train",
+            "audio_col": "audio",
+            "text_col": "transcription",
+            "max": 2_000,
+        },
+    ],
+}
+# ── Wikipedia text harvest ────────────────────────────────────────────────────
+def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
+    """Fetch short sentences from the language's Wikipedia.
+
+    Args:
+        lang: Key into WIKI_APIS; an unknown language yields an empty list.
+        max_articles: Upper bound on article titles to request.  Converted to
+            int and capped at 500 for the single ``allpages`` request.
+
+    Returns:
+        A list of ``{"word", "translation", "language", "source"}`` dicts for
+        vocabulary.jsonl: ``word`` is a 3-20 word sentence, ``translation``
+        is the article title, ``source`` is "wikipedia".  The list is empty
+        when the title list cannot be fetched; a failed extract batch is
+        skipped.
+    """
+    import http.client, urllib.request, urllib.parse
+    api_url = WIKI_APIS.get(lang)
+    if not api_url:
+        return []
+    limit = min(int(max_articles), 500)
+    if limit <= 0:
+        return []
+    # Wikimedia asks API clients to identify themselves with a User-Agent.
+    headers = {"User-Agent": "sahel-agri-web-harvester/1.0 (Python urllib)"}
+    # Network, HTTP and JSON-decoding failures handled as "no data".
+    fetch_errors = (OSError, ValueError, http.client.HTTPException)
+
+    def _get_json(query: dict) -> dict:
+        """GET the API with this query string and decode the JSON reply."""
+        req = urllib.request.Request(
+            f"{api_url}?{urllib.parse.urlencode(query)}", headers=headers
+        )
+        with urllib.request.urlopen(req, timeout=15) as r:
+            return json.loads(r.read())
+
+    # Step 1: get a list of article titles
+    try:
+        data = _get_json({
+            "action": "query",
+            "list": "allpages",
+            "aplimit": limit,
+            "apfilterredir": "nonredirects",
+            "format": "json",
+        })
+    except fetch_errors:
+        return []
+    titles = [
+        p["title"]
+        for p in data.get("query", {}).get("allpages", [])
+        if p.get("title")
+    ]
+    if not titles:
+        return []
+    # Step 2: fetch plain-text extracts in batches of 20
+    entries = []
+    for i in range(0, len(titles), 20):
+        batch = titles[i:i + 20]
+        try:
+            data2 = _get_json({
+                "action": "query",
+                "titles": "|".join(batch),
+                "prop": "extracts",
+                # TextExtracts returns more than one extract per request only
+                # when exintro is set; without it most of the batch is dropped.
+                "exintro": 1,
+                "exsentences": 3,
+                "exlimit": len(batch),
+                "explaintext": 1,
+                "format": "json",
+            })
+        except fetch_errors:
+            # Best effort: skip this batch and continue with the next one.
+            data2 = {}
+        for page in data2.get("query", {}).get("pages", {}).values():
+            extract = (page.get("extract") or "").strip()
+            title   = page.get("title", "").strip()
+            if not extract or not title:
+                continue
+            # Split into sentences, keep those 3–20 words
+            for sentence in extract.replace("\n", " ").split("."):
+                sentence = sentence.strip()
+                if 3 <= len(sentence.split()) <= 20:
+                    entries.append({
+                        "word": sentence,
+                        "translation": title,   # use article title as loose context
+                        "language": lang,
+                        "source": "wikipedia",
+                    })
+        time.sleep(0.3)  # be polite to Wikipedia servers
+    return entries
+# ── HF dataset audio harvest ──────────────────────────────────────────────────
+def harvest_hf_audio(
+    lang: str,
+    hf_token: str | None,
+    progress_cb=None,
+) -> Generator[tuple[bytes, str, str], None, None]:
+    """Stream samples from the HF ASR sources configured for ``lang``.
+
+    Yields ``(wav_bytes, transcription, audio_repo_path)`` tuples.  The caller
+    uploads the audio and writes corrections.jsonl.  If ``progress_cb`` is
+    given it is called as ``progress_cb(current, total, message)`` for every
+    sample read.  A source that cannot be loaded and a sample that cannot be
+    converted are skipped silently.
+    """
+    import numpy as np
+    for spec in HF_ASR_SOURCES.get(lang, []):
+        try:
+            from datasets import load_dataset, Audio as HFAudio
+        except ImportError:
+            continue
+        try:
+            stream = load_dataset(
+                spec["repo"], spec["config"],
+                split=spec["split"],
+                streaming=True,
+                token=hf_token,
+                trust_remote_code=False,
+            )
+            audio_key = spec["audio_col"]
+            # Ask datasets for 16 kHz audio, the rate written into the WAV below.
+            stream = stream.cast_column(audio_key, HFAudio(sampling_rate=16_000))
+            limit = spec["max"]
+            for position, row in enumerate(stream.take(limit), start=1):
+                if progress_cb:
+                    progress_cb(position, limit, f"{spec['repo']} ({lang})")
+                try:
+                    samples = np.array(row[audio_key]["array"], dtype=np.float32)
+                    transcript = (row.get(spec["text_col"]) or "").strip()
+                    # Keep only clips with text and at least 3,200 samples (0.2 s at 16 kHz).
+                    if transcript and len(samples) >= 3_200:
+                        encoded = _numpy_to_wav_bytes(samples, 16_000)
+                        stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
+                        yield encoded, transcript, f"audio/{lang}_{stamp}.wav"
+                except Exception:
+                    continue
+        except Exception:
+            continue
+def _numpy_to_wav_bytes(audio: "np.ndarray", sr: int) -> bytes:
+    """Encode samples as a mono 16-bit PCM WAV byte string.
+
+    Samples are scaled by 32767, clipped to the int16 range and written
+    little-endian after a 44-byte RIFF/WAVE header.
+    """
+    import struct, io as _io
+    pcm = (audio * 32767).clip(-32768, 32767).astype("<i2").tobytes()
+    header = struct.pack(
+        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF", 36 + len(pcm), b"WAVE",
+        # fmt chunk: size 16, PCM (1), 1 channel, sample rate,
+        # byte rate (sr * 2), block align 2, 16 bits per sample.
+        b"fmt ", 16, 1, 1, sr, sr * 2, 2, 16,
+        b"data", len(pcm),
+    )
+    out = _io.BytesIO()
+    out.write(header)
+    out.write(pcm)
+    return out.getvalue()