"""
Web harvester — pulls Bambara/Fula data from public internet sources
into the sahel-agri-feedback HF dataset repo.
Sources:
- RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples)
- google/fleurs ff_sn (HF, Fula audio+text)
- bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only -> vocabulary.jsonl)
HF audio datasets are registered by reference in dataset_sources.jsonl — the
Kaggle notebook loads them directly at training time. This avoids re-uploading
gigabytes of audio through the Space (which would timeout every time).
"""
from __future__ import annotations
import io
import json
import time
WIKI_APIS = {
"bam": "https://bm.wikipedia.org/w/api.php",
"ful": "https://ff.wikipedia.org/w/api.php",
}
# Wikipedia requires a descriptive User-Agent or returns 403
_UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)"
# Datasets registered by reference — Kaggle notebook loads them directly
HF_DATASET_REGISTRY = {
"bam": [
{
"repo": "RobotsMali/jeli-asr",
"config": "jeli-asr",
"split": "train",
"audio_col": "audio",
"text_col": "bam",
"max": 5_000,
"license": "cc-by-4.0",
},
],
"ful": [
{
"repo": "google/WaxalNLP",
"config": "ful_asr",
"split": "train",
"audio_col": "audio",
"text_col": "transcription",
"max": 2_000,
"license": "cc-by-4.0",
},
{
"repo": "Pullo-Africa-Protagonist/Fula-pular",
"config": "default",
"split": "train",
"audio_col": "audio",
"text_col": "transcription",
"max": 5_000,
"license": "cc-by-4.0",
"note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
},
{
"repo": "guizme/adlam_fulfulde",
"config": "default",
"split": "train",
"audio_col": "audio",
"text_col": "transcription",
"max": 51,
"license": "cc-by-4.0",
"adlam": True,
"note": "51 Adlam-script audio rows — converted to Latin before training",
},
# OpenSLR SLR106 — West African Virtual Assistant ASR Corpus (Guinea, CC BY-SA 4.0)
# 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech,
# multi-device, ages 5–76). Best available clean Guinea Pular ASR data.
        # Download manually from https://openslr.org/106/ and upload to HF before
        # training (a hedged push_to_hub sketch follows this registry).
        # Uncomment once the dataset repo is populated.
# {
# "repo": "ous-sow/slr106-pular",
# "config": "default",
# "split": "train",
# "audio_col": "audio",
# "text_col": "transcription",
# "max": 10_000,
# "license": "cc-by-sa-4.0",
# "note": "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers",
# },
# OpenSLR SLR105 — West African Radio Corpus (Guinea, CC BY-SA 4.0)
# ~142 hours raw radio audio from 6 Guinea stations; Pular-tagged validation
# set of 300 clips. Noisier than SLR106 but larger.
# Uncomment once uploaded to HF.
# {
# "repo": "ous-sow/slr105-pular",
# "config": "default",
# "split": "validation",
# "audio_col": "audio",
# "text_col": "transcription",
# "max": 300,
# "license": "cc-by-sa-4.0",
# "note": "OpenSLR SLR105 Guinea radio — Pular-tagged validation split",
# },
],
}
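
# A minimal sketch of the "upload to HF before training" step referenced in the
# SLR106/SLR105 entries above. The target repo id follows the commented-out
# registry entry (ous-sow/slr106-pular); the transcript layout (a TSV of
# "<audio file>\t<transcription>" rows) is an assumption, since the actual
# OpenSLR packaging may differ. Treat this as an outline, not the Space's
# real uploader.
def push_slr_corpus_to_hub(audio_dir: str, transcript_tsv: str,
                           repo_id: str = "ous-sow/slr106-pular") -> None:
    """Build a datasets.Dataset from a local SLR download and push it to the Hub."""
    import csv
    import os
    from datasets import Audio, Dataset

    audio_paths, transcriptions = [], []
    with open(transcript_tsv, encoding="utf-8") as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if len(row) < 2:
                continue
            audio_paths.append(os.path.join(audio_dir, row[0]))
            transcriptions.append(row[1])

    ds = Dataset.from_dict({"audio": audio_paths, "transcription": transcriptions})
    # cast_column decodes the file paths into the Audio feature expected by the registry
    ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
    ds.push_to_hub(repo_id)
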
# ── Wikipedia text harvest ────────────────────────────────────────────────────
def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
"""
Fetch up to max_articles article extracts from the language Wikipedia.
Returns list of {word, translation, language, source} dicts for vocabulary.jsonl.
"""
import urllib.request, urllib.parse
api_url = WIKI_APIS.get(lang)
if not api_url:
return []
def _get(params: dict) -> dict:
url = f"{api_url}?{urllib.parse.urlencode(params)}"
req = urllib.request.Request(url, headers={"User-Agent": _UA})
with urllib.request.urlopen(req, timeout=20) as r:
return json.loads(r.read())
# Step 1: get article titles
data = _get({"action": "query", "list": "allpages", "aplimit": max_articles,
"apfilterredir": "nonredirects", "format": "json"})
titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
if not titles:
return []
# Step 2: fetch plain-text extracts in batches of 20
entries = []
for i in range(0, len(titles), 20):
batch = titles[i:i + 20]
try:
data2 = _get({"action": "query", "titles": "|".join(batch),
"prop": "extracts", "exsentences": 3,
"exlimit": len(batch), "explaintext": True, "format": "json"})
for page in data2.get("query", {}).get("pages", {}).values():
extract = (page.get("extract") or "").strip()
title = page.get("title", "").strip()
if not extract or not title:
continue
for sentence in extract.replace("\n", " ").split("."):
sentence = sentence.strip()
words = sentence.split()
if 3 <= len(words) <= 20:
entries.append({
"word": sentence,
"translation": title,
"language": lang,
"source": "wikipedia",
})
        except Exception:
            # Network/API hiccup: skip this batch rather than abort the whole harvest
            pass
time.sleep(0.3)
return entries
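
# Illustrative usage only: nothing in the Space calls this helper; it exists to
# make the shape of the harvested vocabulary.jsonl entries visible during development.
def _preview_wikipedia_harvest(lang: str = "bam", max_articles: int = 5) -> None:
    """Print a few harvested sentences as a quick smoke test."""
    for entry in harvest_wikipedia_text(lang, max_articles)[:3]:
        print(f"[{entry['language']}] {entry['word']!r}  <-  {entry['translation']}")
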
# ── HF dataset registration (reference-based, no re-upload) ──────────────────
def get_hf_dataset_refs(lang: str) -> list[dict]:
"""Return the dataset reference dicts for this language."""
return HF_DATASET_REGISTRY.get(lang, [])
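
# A minimal sketch of how the Kaggle notebook might consume these references at
# training time with the standard `datasets` streaming API. The helper name
# `iter_registered_samples` is illustrative; the real notebook code lives outside
# this module.
def iter_registered_samples(lang: str):
    """Yield (audio, text) pairs from every dataset registered for `lang`."""
    from itertools import islice

    from datasets import load_dataset

    for ref in get_hf_dataset_refs(lang):
        ds = load_dataset(ref["repo"], ref["config"], split=ref["split"], streaming=True)
        for row in islice(ds, ref["max"]):
            # Under the Audio feature, row[audio_col] is a dict with "array" and "sampling_rate"
            yield row[ref["audio_col"]], row[ref["text_col"]]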