""" Web harvester — pulls Bambara/Fula data from public internet sources into the sahel-agri-feedback HF dataset repo. Sources: - RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples) - google/fleurs ff_sn (HF, Fula audio+text) - bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only -> vocabulary.jsonl) HF audio datasets are registered by reference in dataset_sources.jsonl — the Kaggle notebook loads them directly at training time. This avoids re-uploading gigabytes of audio through the Space (which would timeout every time). """ from __future__ import annotations import io import json import time WIKI_APIS = { "bam": "https://bm.wikipedia.org/w/api.php", "ful": "https://ff.wikipedia.org/w/api.php", } # Wikipedia requires a descriptive User-Agent or returns 403 _UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)" # Datasets registered by reference — Kaggle notebook loads them directly HF_DATASET_REGISTRY = { "bam": [ { "repo": "RobotsMali/jeli-asr", "config": "jeli-asr", "split": "train", "audio_col": "audio", "text_col": "bam", "max": 5_000, "license": "cc-by-4.0", }, ], "ful": [ { "repo": "google/WaxalNLP", "config": "ful_asr", "split": "train", "audio_col": "audio", "text_col": "transcription", "max": 2_000, "license": "cc-by-4.0", }, { "repo": "Pullo-Africa-Protagonist/Fula-pular", "config": "default", "split": "train", "audio_col": "audio", "text_col": "transcription", "max": 5_000, "license": "cc-by-4.0", "note": "9,761 Pular (Guinea) audio rows — primary ASR training source", }, { "repo": "guizme/adlam_fulfulde", "config": "default", "split": "train", "audio_col": "audio", "text_col": "transcription", "max": 51, "license": "cc-by-4.0", "adlam": True, "note": "51 Adlam-script audio rows — converted to Latin before training", }, # OpenSLR SLR106 — West African Virtual Assistant ASR Corpus (Guinea, CC BY-SA 4.0) # 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech, # multi-device, ages 5–76). Best available clean Guinea Pular ASR data. # Download manually from https://openslr.org/106/ and upload to HF before training. # Uncomment once the dataset repo is populated. # { # "repo": "ous-sow/slr106-pular", # "config": "default", # "split": "train", # "audio_col": "audio", # "text_col": "transcription", # "max": 10_000, # "license": "cc-by-sa-4.0", # "note": "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers", # }, # OpenSLR SLR105 — West African Radio Corpus (Guinea, CC BY-SA 4.0) # ~142 hours raw radio audio from 6 Guinea stations; Pular-tagged validation # set of 300 clips. Noisier than SLR106 but larger. # Uncomment once uploaded to HF. # { # "repo": "ous-sow/slr105-pular", # "config": "default", # "split": "validation", # "audio_col": "audio", # "text_col": "transcription", # "max": 300, # "license": "cc-by-sa-4.0", # "note": "OpenSLR SLR105 Guinea radio — Pular-tagged validation split", # }, ], } # ── Wikipedia text harvest ──────────────────────────────────────────────────── def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]: """ Fetch up to max_articles article extracts from the language Wikipedia. Returns list of {word, translation, language, source} dicts for vocabulary.jsonl. """ import urllib.request, urllib.parse api_url = WIKI_APIS.get(lang) if not api_url: return [] def _get(params: dict) -> dict: url = f"{api_url}?{urllib.parse.urlencode(params)}" req = urllib.request.Request(url, headers={"User-Agent": _UA}) with urllib.request.urlopen(req, timeout=20) as r: return json.loads(r.read()) # Step 1: get article titles data = _get({"action": "query", "list": "allpages", "aplimit": max_articles, "apfilterredir": "nonredirects", "format": "json"}) titles = [p["title"] for p in data.get("query", {}).get("allpages", [])] if not titles: return [] # Step 2: fetch plain-text extracts in batches of 20 entries = [] for i in range(0, len(titles), 20): batch = titles[i:i + 20] try: data2 = _get({"action": "query", "titles": "|".join(batch), "prop": "extracts", "exsentences": 3, "exlimit": len(batch), "explaintext": True, "format": "json"}) for page in data2.get("query", {}).get("pages", {}).values(): extract = (page.get("extract") or "").strip() title = page.get("title", "").strip() if not extract or not title: continue for sentence in extract.replace("\n", " ").split("."): sentence = sentence.strip() words = sentence.split() if 3 <= len(words) <= 20: entries.append({ "word": sentence, "translation": title, "language": lang, "source": "wikipedia", }) except Exception: pass time.sleep(0.3) return entries # ── HF dataset registration (reference-based, no re-upload) ────────────────── def get_hf_dataset_refs(lang: str) -> list[dict]: """Return the dataset reference dicts for this language.""" return HF_DATASET_REGISTRY.get(lang, [])