| """ | |
| Web harvester — pulls Bambara/Fula data from public internet sources | |
| into the sahel-agri-feedback HF dataset repo. | |
| Sources: | |
| - RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples) | |
| - google/fleurs ff_sn (HF, Fula audio+text) | |
| - bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only -> vocabulary.jsonl) | |
| HF audio datasets are registered by reference in dataset_sources.jsonl — the | |
| Kaggle notebook loads them directly at training time. This avoids re-uploading | |
| gigabytes of audio through the Space (which would timeout every time). | |
| """ | |
from __future__ import annotations

import io
import json
import time
WIKI_APIS = {
    "bam": "https://bm.wikipedia.org/w/api.php",
    "ful": "https://ff.wikipedia.org/w/api.php",
}

# Wikipedia requires a descriptive User-Agent or returns 403
_UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)"
# Datasets registered by reference — Kaggle notebook loads them directly
HF_DATASET_REGISTRY = {
    "bam": [
        {
            "repo": "RobotsMali/jeli-asr",
            "config": "jeli-asr",
            "split": "train",
            "audio_col": "audio",
            "text_col": "bam",
            "max": 5_000,
            "license": "cc-by-4.0",
        },
    ],
    "ful": [
        {
            "repo": "google/WaxalNLP",
            "config": "ful_asr",
            "split": "train",
            "audio_col": "audio",
            "text_col": "transcription",
            "max": 2_000,
            "license": "cc-by-4.0",
        },
        {
            "repo": "Pullo-Africa-Protagonist/Fula-pular",
            "config": "default",
            "split": "train",
            "audio_col": "audio",
            "text_col": "transcription",
            "max": 5_000,
            "license": "cc-by-4.0",
            "note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
        },
        {
            "repo": "guizme/adlam_fulfulde",
            "config": "default",
            "split": "train",
            "audio_col": "audio",
            "text_col": "transcription",
            "max": 51,
            "license": "cc-by-4.0",
            "adlam": True,
            "note": "51 Adlam-script audio rows — converted to Latin before training",
        },
        # OpenSLR SLR106 — West African Virtual Assistant ASR Corpus (Guinea, CC BY-SA 4.0).
        # 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech,
        # multi-device, ages 5–76). Best available clean Guinea Pular ASR data.
        # Download manually from https://openslr.org/106/ and upload to HF before
        # training (see the push_to_hub sketch after this registry), then uncomment.
        # {
        #     "repo": "ous-sow/slr106-pular",
        #     "config": "default",
        #     "split": "train",
        #     "audio_col": "audio",
        #     "text_col": "transcription",
        #     "max": 10_000,
        #     "license": "cc-by-sa-4.0",
        #     "note": "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers",
        # },
        # OpenSLR SLR105 — West African Radio Corpus (Guinea, CC BY-SA 4.0).
        # ~142 hours of raw radio audio from 6 Guinea stations; Pular-tagged validation
        # set of 300 clips. Noisier than SLR106 but larger.
        # Uncomment once uploaded to HF (same upload path as SLR106).
        # {
        #     "repo": "ous-sow/slr105-pular",
        #     "config": "default",
        #     "split": "validation",
        #     "audio_col": "audio",
        #     "text_col": "transcription",
        #     "max": 300,
        #     "license": "cc-by-sa-4.0",
        #     "note": "OpenSLR SLR105 Guinea radio — Pular-tagged validation split",
        # },
    ],
}
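
# Illustrative upload sketch (an assumption, not part of the Space's current flow):
# one way to publish a locally downloaded OpenSLR corpus (SLR106 / SLR105) to the
# HF repos referenced above, so the commented registry entries can be enabled.
# The caller is expected to have already paired each wav path with its transcript;
# the OpenSLR directory layout is deliberately not encoded here. Uses only the
# standard `datasets` APIs: Dataset.from_dict, cast_column(Audio), push_to_hub.
def push_openslr_corpus(pairs: list[tuple[str, str]], repo_id: str) -> None:
    """pairs = [(path/to/utterance.wav, transcription), ...] -> push to repo_id."""
    from datasets import Audio, Dataset  # lazy import, as in harvest_wikipedia_text below

    ds = Dataset.from_dict({
        "audio": [path for path, _ in pairs],
        "transcription": [text for _, text in pairs],
    }).cast_column("audio", Audio())
    ds.push_to_hub(repo_id)
# Example: push_openslr_corpus(slr106_pairs, "ous-sow/slr106-pular")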
# ── Wikipedia text harvest ────────────────────────────────────────────────────
def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
    """
    Fetch up to max_articles article extracts from the language Wikipedia.
    Returns a list of {word, translation, language, source} dicts for vocabulary.jsonl.
    """
    import urllib.parse
    import urllib.request

    api_url = WIKI_APIS.get(lang)
    if not api_url:
        return []

    def _get(params: dict) -> dict:
        url = f"{api_url}?{urllib.parse.urlencode(params)}"
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return json.loads(r.read())

    # Step 1: get article titles
    data = _get({"action": "query", "list": "allpages", "aplimit": max_articles,
                 "apfilterredir": "nonredirects", "format": "json"})
    titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
    if not titles:
        return []

    # Step 2: fetch plain-text extracts in batches of 20 (the API's exlimit maximum)
    entries = []
    for i in range(0, len(titles), 20):
        batch = titles[i:i + 20]
        try:
            data2 = _get({"action": "query", "titles": "|".join(batch),
                          "prop": "extracts", "exsentences": 3,
                          "exlimit": len(batch), "explaintext": True, "format": "json"})
            for page in data2.get("query", {}).get("pages", {}).values():
                extract = (page.get("extract") or "").strip()
                title = page.get("title", "").strip()
                if not extract or not title:
                    continue
                # Keep short sentences (3–20 words) as vocabulary entries
                for sentence in extract.replace("\n", " ").split("."):
                    sentence = sentence.strip()
                    words = sentence.split()
                    if 3 <= len(words) <= 20:
                        entries.append({
                            "word": sentence,
                            "translation": title,
                            "language": lang,
                            "source": "wikipedia",
                        })
        except Exception:
            # Skip failed batches; partial results are still useful
            pass
        time.sleep(0.3)  # stay polite to the Wikipedia API
    return entries
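
# Illustrative usage sketch (assumption: the Space appends harvested sentences to
# a local vocabulary.jsonl before syncing it to the dataset repo, as the module
# docstring implies; the exact write path and sync step are not shown here):
#
#     rows = harvest_wikipedia_text("bam", max_articles=50)
#     with open("vocabulary.jsonl", "a", encoding="utf-8") as f:
#         for row in rows:
#             f.write(json.dumps(row, ensure_ascii=False) + "\n")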
# ── HF dataset registration (reference-based, no re-upload) ──────────────────
def get_hf_dataset_refs(lang: str) -> list[dict]:
    """Return the dataset reference dicts for this language."""
    return HF_DATASET_REGISTRY.get(lang, [])
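
# Illustrative consumer-side sketch (this runs in the Kaggle notebook, not in the
# Space): each reference is loaded with the standard `datasets.load_dataset` call
# and truncated to its `max` rows. Column names and limits come straight from the
# registry dicts above; the training loop itself is out of scope here.
#
#     from datasets import load_dataset
#
#     for ref in get_hf_dataset_refs("ful"):
#         ds = load_dataset(ref["repo"], ref["config"], split=ref["split"])
#         ds = ds.select(range(min(ref["max"], len(ds))))
#         texts = ds[ref["text_col"]]    # transcriptions
#         clips = ds[ref["audio_col"]]   # decoded audio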