"""
Web harvester — pulls Bambara/Fula data from public internet sources
into the sahel-agri-feedback HF dataset repo.

Sources:
  - RobotsMali/jeli-asr (HF, 33k Bambara audio+text samples)
  - google/WaxalNLP ful_asr, Pullo-Africa-Protagonist/Fula-pular, and
    guizme/adlam_fulfulde (HF, Fula/Pular audio+text)
  - bm.wikipedia.org / ff.wikipedia.org (Wikipedia API, text only -> vocabulary.jsonl)

HF audio datasets are registered by reference in dataset_sources.jsonl — the
Kaggle notebook loads them directly at training time (see the loading sketch at
the end of this module). This avoids re-uploading gigabytes of audio through
the Space (which would time out every time).
"""
from __future__ import annotations
import io
import json
import time
WIKI_APIS = {
    "bam": "https://bm.wikipedia.org/w/api.php",
    "ful": "https://ff.wikipedia.org/w/api.php",
}
# Wikipedia requires a descriptive User-Agent or returns 403
_UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)"
# Datasets registered by reference — Kaggle notebook loads them directly
HF_DATASET_REGISTRY = {
    "bam": [
        {
            "repo": "RobotsMali/jeli-asr",
            "config": "jeli-asr",
            "split": "train",
            "audio_col": "audio",
            "text_col": "bam",
            "max": 5_000,
            "license": "cc-by-4.0",
        },
    ],
    "ful": [
        {
            "repo": "google/WaxalNLP",
            "config": "ful_asr",
            "split": "train",
            "audio_col": "audio",
            "text_col": "transcription",
            "max": 2_000,
            "license": "cc-by-4.0",
        },
        {
            "repo": "Pullo-Africa-Protagonist/Fula-pular",
            "config": "default",
            "split": "train",
            "audio_col": "audio",
            "text_col": "transcription",
            "max": 5_000,
            "license": "cc-by-4.0",
            "note": "9,761 Pular (Guinea) audio rows — primary ASR training source",
        },
        {
            "repo": "guizme/adlam_fulfulde",
            "config": "default",
            "split": "train",
            "audio_col": "audio",
            "text_col": "transcription",
            "max": 51,
            "license": "cc-by-4.0",
            "adlam": True,
            "note": "51 Adlam-script audio rows — converted to Latin before training",
        },
        # OpenSLR SLR106 — West African Virtual Assistant ASR Corpus (Guinea, CC BY-SA 4.0)
        # 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech,
        # multi-device, ages 5–76). Best available clean Guinea Pular ASR data.
        # Download manually from https://openslr.org/106/ and upload to HF before
        # training (see the upload sketch after this registry).
        # Uncomment once the dataset repo is populated.
        # {
        #     "repo": "ous-sow/slr106-pular",
        #     "config": "default",
        #     "split": "train",
        #     "audio_col": "audio",
        #     "text_col": "transcription",
        #     "max": 10_000,
        #     "license": "cc-by-sa-4.0",
        #     "note": "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers",
        # },
        # OpenSLR SLR105 — West African Radio Corpus (Guinea, CC BY-SA 4.0)
        # ~142 hours raw radio audio from 6 Guinea stations; Pular-tagged validation
        # set of 300 clips. Noisier than SLR106 but larger.
        # Uncomment once uploaded to HF.
        # {
        #     "repo": "ous-sow/slr105-pular",
        #     "config": "default",
        #     "split": "validation",
        #     "audio_col": "audio",
        #     "text_col": "transcription",
        #     "max": 300,
        #     "license": "cc-by-sa-4.0",
        #     "note": "OpenSLR SLR105 Guinea radio — Pular-tagged validation split",
        # },
    ],
}
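
# ── OpenSLR upload sketch (hedged) ────────────────────────────────────────────
# The SLR106/SLR105 entries above stay commented out until those corpora are
# uploaded to HF. A minimal sketch of that one-off upload follows; it assumes the
# OpenSLR archive has been extracted locally into WAV files plus a tab-separated
# "path<TAB>transcription" index file. The index layout and the default repo name
# are assumptions to verify against the real archive before running.
def push_openslr_corpus_to_hf(index_tsv: str, repo_id: str = "ous-sow/slr106-pular") -> None:
    """One-off helper: build an audio+text dataset from a local index and push it to HF."""
    from datasets import Audio, Dataset  # imported lazily; only needed for the upload

    paths: list[str] = []
    texts: list[str] = []
    with open(index_tsv, encoding="utf-8") as f:
        for line in f:
            path, _, text = line.rstrip("\n").partition("\t")
            if path and text:
                paths.append(path)
                texts.append(text)

    ds = Dataset.from_dict({"audio": paths, "transcription": texts})
    ds = ds.cast_column("audio", Audio(sampling_rate=16_000))  # decode WAVs on access
    ds.push_to_hub(repo_id)  # requires a prior `huggingface-cli login`
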
# ── Wikipedia text harvest ────────────────────────────────────────────────────
def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
    """
    Fetch up to max_articles article extracts from the language Wikipedia.
    Returns a list of {word, translation, language, source} dicts for vocabulary.jsonl.
    """
    import urllib.request, urllib.parse

    api_url = WIKI_APIS.get(lang)
    if not api_url:
        return []

    def _get(params: dict) -> dict:
        url = f"{api_url}?{urllib.parse.urlencode(params)}"
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return json.loads(r.read())

    # Step 1: get article titles
    data = _get({"action": "query", "list": "allpages", "aplimit": max_articles,
                 "apfilterredir": "nonredirects", "format": "json"})
    titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
    if not titles:
        return []

    # Step 2: fetch plain-text extracts in batches of 20
    entries = []
    for i in range(0, len(titles), 20):
        batch = titles[i:i + 20]
        try:
            data2 = _get({"action": "query", "titles": "|".join(batch),
                          "prop": "extracts", "exsentences": 3,
                          "exlimit": len(batch), "explaintext": True, "format": "json"})
            for page in data2.get("query", {}).get("pages", {}).values():
                extract = (page.get("extract") or "").strip()
                title = page.get("title", "").strip()
                if not extract or not title:
                    continue
                # Keep short-to-medium sentences (3–20 words) as vocabulary entries
                for sentence in extract.replace("\n", " ").split("."):
                    sentence = sentence.strip()
                    words = sentence.split()
                    if 3 <= len(words) <= 20:
                        entries.append({
                            "word": sentence,
                            "translation": title,
                            "language": lang,
                            "source": "wikipedia",
                        })
        except Exception:
            pass
        time.sleep(0.3)  # stay polite to the Wikipedia API
    return entries
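
# Example usage (a sketch, not wired into the Space UI): harvest Bambara extracts
# and append them to vocabulary.jsonl, the file named in the module docstring.
# The local file path is an assumption; the Space may write it somewhere else.
#
#     entries = harvest_wikipedia_text("bam", max_articles=50)
#     with open("vocabulary.jsonl", "a", encoding="utf-8") as f:
#         for entry in entries:
#             f.write(json.dumps(entry, ensure_ascii=False) + "\n")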
# ── HF dataset registration (reference-based, no re-upload) ──────────────────
def get_hf_dataset_refs(lang: str) -> list[dict]:
    """Return the dataset reference dicts for this language."""
    return HF_DATASET_REGISTRY.get(lang, [])
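

# ── Kaggle-side loading sketch (hedged) ───────────────────────────────────────
# The refs above are what get written to dataset_sources.jsonl and later consumed
# by the Kaggle training notebook, which streams each registered dataset straight
# from HF instead of pulling audio through the Space. A minimal sketch of that
# notebook-side loop (names here are illustrative, not the notebook's real code):
def iter_registered_samples(lang: str):
    """Yield (audio, text) pairs from every dataset registered for `lang`."""
    from datasets import load_dataset  # imported lazily; only needed at training time

    for ref in get_hf_dataset_refs(lang):
        ds = load_dataset(ref["repo"], ref["config"], split=ref["split"], streaming=True)
        for i, row in enumerate(ds):
            if i >= ref["max"]:
                break
            yield row[ref["audio_col"]], row[ref["text_col"]]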