"""
Web harvester — pulls Bambara/Fula data from public internet sources
into the sahel-agri-feedback HF dataset repo.
Sources:
- RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples)
- google/fleurs ff_sn (HF, Fula audio+text)
- bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only -> vocabulary.jsonl)
HF audio datasets are registered by reference in dataset_sources.jsonl — the
Kaggle notebook loads them directly at training time. This avoids re-uploading
gigabytes of audio through the Space (which would timeout every time).
"""
from __future__ import annotations
import io
import json
import time
WIKI_APIS = {
"bam": "https://bm.wikipedia.org/w/api.php",
"ful": "https://ff.wikipedia.org/w/api.php",
}
# Wikipedia requires a descriptive User-Agent or returns 403
_UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)"
# Datasets registered by reference — Kaggle notebook loads them directly
HF_DATASET_REGISTRY = {
"bam": [
{
"repo": "RobotsMali/jeli-asr",
"config": "jeli-asr",
"split": "train",
"audio_col": "audio",
"text_col": "bam",
"max": 5_000,
"license": "cc-by-4.0",
},
],
"ful": [
{
"repo": "google/WaxalNLP",
"config": "ful_asr",
"split": "train",
"audio_col": "audio",
"text_col": "transcription",
"max": 2_000,
"license": "cc-by-4.0",
},
{
"repo": "Pullo-Africa-Protagonist/Fula-pular",
"config": "default",
"split": "train",
"audio_col": "audio",
"text_col": "transcription",
"max": 5_000,
"license": "cc-by-4.0",
"note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
},
{
"repo": "guizme/adlam_fulfulde",
"config": "default",
"split": "train",
"audio_col": "audio",
"text_col": "transcription",
"max": 51,
"license": "cc-by-4.0",
"adlam": True,
"note": "51 Adlam-script audio rows — converted to Latin before training",
},
# OpenSLR SLR106 — West African Virtual Assistant ASR Corpus (Guinea, CC BY-SA 4.0)
# 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech,
# multi-device, ages 5–76). Best available clean Guinea Pular ASR data.
        # Download manually from https://openslr.org/106/ and upload to HF before
        # training (a hedged push_to_hub sketch follows this registry).
        # Uncomment once the dataset repo is populated.
# {
# "repo": "ous-sow/slr106-pular",
# "config": "default",
# "split": "train",
# "audio_col": "audio",
# "text_col": "transcription",
# "max": 10_000,
# "license": "cc-by-sa-4.0",
# "note": "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers",
# },
# OpenSLR SLR105 — West African Radio Corpus (Guinea, CC BY-SA 4.0)
# ~142 hours raw radio audio from 6 Guinea stations; Pular-tagged validation
# set of 300 clips. Noisier than SLR106 but larger.
# Uncomment once uploaded to HF.
# {
# "repo": "ous-sow/slr105-pular",
# "config": "default",
# "split": "validation",
# "audio_col": "audio",
# "text_col": "transcription",
# "max": 300,
# "license": "cc-by-sa-4.0",
# "note": "OpenSLR SLR105 Guinea radio — Pular-tagged validation split",
# },
],
}
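
# A minimal sketch of the "upload to HF before training" step referenced in the
# SLR106/SLR105 entries above. The target repo id follows the commented-out
# registry entry (ous-sow/slr106-pular); the transcript layout (a TSV of
# "<audio file>\t<transcription>" rows) is an assumption, since the actual
# OpenSLR packaging may differ. Treat this as an outline, not the Space's
# real uploader.
def push_slr_corpus_to_hub(audio_dir: str, transcript_tsv: str,
                           repo_id: str = "ous-sow/slr106-pular") -> None:
    """Build a datasets.Dataset from a local SLR download and push it to the Hub."""
    import csv
    import os
    from datasets import Audio, Dataset

    audio_paths, transcriptions = [], []
    with open(transcript_tsv, encoding="utf-8") as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if len(row) < 2:
                continue
            audio_paths.append(os.path.join(audio_dir, row[0]))
            transcriptions.append(row[1])

    ds = Dataset.from_dict({"audio": audio_paths, "transcription": transcriptions})
    # cast_column decodes the file paths into the Audio feature expected by the registry
    ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
    ds.push_to_hub(repo_id)
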
# ── Wikipedia text harvest ────────────────────────────────────────────────────
def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
"""
Fetch up to max_articles article extracts from the language Wikipedia.
Returns list of {word, translation, language, source} dicts for vocabulary.jsonl.
"""
import urllib.request, urllib.parse
api_url = WIKI_APIS.get(lang)
if not api_url:
return []
def _get(params: dict) -> dict:
url = f"{api_url}?{urllib.parse.urlencode(params)}"
req = urllib.request.Request(url, headers={"User-Agent": _UA})
with urllib.request.urlopen(req, timeout=20) as r:
return json.loads(r.read())
# Step 1: get article titles
data = _get({"action": "query", "list": "allpages", "aplimit": max_articles,
"apfilterredir": "nonredirects", "format": "json"})
titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
if not titles:
return []
# Step 2: fetch plain-text extracts in batches of 20
entries = []
for i in range(0, len(titles), 20):
batch = titles[i:i + 20]
try:
data2 = _get({"action": "query", "titles": "|".join(batch),
"prop": "extracts", "exsentences": 3,
"exlimit": len(batch), "explaintext": True, "format": "json"})
for page in data2.get("query", {}).get("pages", {}).values():
extract = (page.get("extract") or "").strip()
title = page.get("title", "").strip()
if not extract or not title:
continue
for sentence in extract.replace("\n", " ").split("."):
sentence = sentence.strip()
words = sentence.split()
if 3 <= len(words) <= 20:
entries.append({
"word": sentence,
"translation": title,
"language": lang,
"source": "wikipedia",
})
        except Exception:
            # Network/API hiccup: skip this batch rather than abort the whole harvest
            pass
time.sleep(0.3)
return entries
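
# Illustrative usage only: nothing in the Space calls this helper; it exists to
# make the shape of the harvested vocabulary.jsonl entries visible during development.
def _preview_wikipedia_harvest(lang: str = "bam", max_articles: int = 5) -> None:
    """Print a few harvested sentences as a quick smoke test."""
    for entry in harvest_wikipedia_text(lang, max_articles)[:3]:
        print(f"[{entry['language']}] {entry['word']!r}  <-  {entry['translation']}")
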
# ── HF dataset registration (reference-based, no re-upload) ──────────────────
def get_hf_dataset_refs(lang: str) -> list[dict]:
"""Return the dataset reference dicts for this language."""
return HF_DATASET_REGISTRY.get(lang, [])
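
# A minimal sketch of how the Kaggle notebook might consume these references at
# training time with the standard `datasets` streaming API. The helper name
# `iter_registered_samples` is illustrative; the real notebook code lives outside
# this module.
def iter_registered_samples(lang: str):
    """Yield (audio, text) pairs from every dataset registered for `lang`."""
    from itertools import islice

    from datasets import load_dataset

    for ref in get_hf_dataset_refs(lang):
        ds = load_dataset(ref["repo"], ref["config"], split=ref["split"], streaming=True)
        for row in islice(ds, ref["max"]):
            # Under the Audio feature, row[audio_col] is a dict with "array" and "sampling_rate"
            yield row[ref["audio_col"]], row[ref["text_col"]]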