"""
Web harvester — pulls Bambara/Fula data from public internet sources
into the sahel-agri-feedback HF dataset repo.

Sources:
  - RobotsMali/jeli-asr                  (HF, 33k Bambara audio+text samples)
  - google/WaxalNLP ful_asr              (HF, Fula audio+text)
  - Pullo-Africa-Protagonist/Fula-pular  (HF, Guinea Pular audio+text)
  - guizme/adlam_fulfulde                (HF, Adlam-script Fula audio+text)
  - bm.wikipedia.org / ff.wikipedia.org  (Wikipedia API, text only -> vocabulary.jsonl)

HF audio datasets are registered by reference in dataset_sources.jsonl — the
Kaggle notebook loads them directly at training time.  This avoids re-uploading
gigabytes of audio through the Space (which would time out every time).
"""
from __future__ import annotations

import io
import json
import time

WIKI_APIS = {
    "bam": "https://bm.wikipedia.org/w/api.php",
    "ful": "https://ff.wikipedia.org/w/api.php",
}

# Wikipedia requires a descriptive User-Agent or returns 403
_UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)"

# Datasets registered by reference — Kaggle notebook loads them directly
HF_DATASET_REGISTRY = {
    "bam": [
        {
            "repo":      "RobotsMali/jeli-asr",
            "config":    "jeli-asr",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "bam",
            "max":       5_000,
            "license":   "cc-by-4.0",
        },
    ],
    "ful": [
        {
            "repo":      "google/WaxalNLP",
            "config":    "ful_asr",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "transcription",
            "max":       2_000,
            "license":   "cc-by-4.0",
        },
        {
            "repo":      "Pullo-Africa-Protagonist/Fula-pular",
            "config":    "default",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "transcription",
            "max":       5_000,
            "license":   "cc-by-4.0",
            "note":      "9,761 Pular (Guinea) audio rows — primary ASR training source",
        },
        {
            "repo":      "guizme/adlam_fulfulde",
            "config":    "default",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "transcription",
            "max":       51,
            "license":   "cc-by-4.0",
            "adlam":     True,
            "note":      "51 Adlam-script audio rows — converted to Latin before training",
        },
        # OpenSLR SLR106 — West African Virtual Assistant ASR Corpus (Guinea, CC BY-SA 4.0)
        # 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech,
        # multi-device, ages 5–76).  Best available clean Guinea Pular ASR data.
        # Download manually from https://openslr.org/106/ and upload to HF before training.
        # Uncomment once the dataset repo is populated.
        # {
        #     "repo":      "ous-sow/slr106-pular",
        #     "config":    "default",
        #     "split":     "train",
        #     "audio_col": "audio",
        #     "text_col":  "transcription",
        #     "max":       10_000,
        #     "license":   "cc-by-sa-4.0",
        #     "note":      "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers",
        # },
        # OpenSLR SLR105 — West African Radio Corpus (Guinea, CC BY-SA 4.0)
        # ~142 hours raw radio audio from 6 Guinea stations; Pular-tagged validation
        # set of 300 clips.  Noisier than SLR106 but larger.
        # Uncomment once uploaded to HF.
        # {
        #     "repo":      "ous-sow/slr105-pular",
        #     "config":    "default",
        #     "split":     "validation",
        #     "audio_col": "audio",
        #     "text_col":  "transcription",
        #     "max":       300,
        #     "license":   "cc-by-sa-4.0",
        #     "note":      "OpenSLR SLR105 Guinea radio — Pular-tagged validation split",
        # },
    ],
}
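
# Illustrative only — a minimal sketch of how the Kaggle notebook is expected to
# consume one entry from the registry above.  The name `ref` and the 16 kHz
# target sampling rate are assumptions for the example, not part of this module:
#
#     from datasets import Audio, load_dataset
#
#     ref = HF_DATASET_REGISTRY["ful"][0]
#     ds  = load_dataset(ref["repo"], ref["config"], split=ref["split"])
#     ds  = ds.cast_column(ref["audio_col"], Audio(sampling_rate=16_000))
#     ds  = ds.select(range(min(ref["max"], len(ds))))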


# ── Wikipedia text harvest ────────────────────────────────────────────────────

def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
    """
    Fetch up to max_articles article extracts from the language Wikipedia.
    Returns list of {word, translation, language, source} dicts for vocabulary.jsonl.
    """
    import urllib.request, urllib.parse

    api_url = WIKI_APIS.get(lang)
    if not api_url:
        return []

    def _get(params: dict) -> dict:
        url = f"{api_url}?{urllib.parse.urlencode(params)}"
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return json.loads(r.read())

    # Step 1: get article titles
    data   = _get({"action": "query", "list": "allpages", "aplimit": max_articles,
                   "apfilterredir": "nonredirects", "format": "json"})
    titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
    if not titles:
        return []

    # Step 2: fetch plain-text extracts in batches of 20
    entries = []
    for i in range(0, len(titles), 20):
        batch = titles[i:i + 20]
        try:
            data2 = _get({"action": "query", "titles": "|".join(batch),
                          "prop": "extracts", "exsentences": 3,
                          "exlimit": len(batch), "explaintext": True, "format": "json"})
            for page in data2.get("query", {}).get("pages", {}).values():
                extract = (page.get("extract") or "").strip()
                title   = page.get("title", "").strip()
                if not extract or not title:
                    continue
                for sentence in extract.replace("\n", " ").split("."):
                    sentence = sentence.strip()
                    words = sentence.split()
                    if 3 <= len(words) <= 20:
                        entries.append({
                            "word":        sentence,
                            "translation": title,
                            "language":    lang,
                            "source":      "wikipedia",
                        })
        except Exception:
            pass
        time.sleep(0.3)

    return entries
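
# Illustrative usage — a sketch, not called anywhere in this module.  The
# vocabulary.jsonl path comes from the module docstring; appending rather than
# rewriting the file is an assumption here:
#
#     bam_entries = harvest_wikipedia_text("bam", max_articles=50)
#     with open("vocabulary.jsonl", "a", encoding="utf-8") as f:
#         for entry in bam_entries:
#             f.write(json.dumps(entry, ensure_ascii=False) + "\n")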


# ── HF dataset registration (reference-based, no re-upload) ──────────────────

def get_hf_dataset_refs(lang: str) -> list[dict]:
    """Return the dataset reference dicts for this language."""
    return HF_DATASET_REGISTRY.get(lang, [])
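

# Illustrative only — one way the reference dicts could be written into
# dataset_sources.jsonl as described in the module docstring.  The exact record
# schema expected by the Space is an assumption here:
#
#     with open("dataset_sources.jsonl", "a", encoding="utf-8") as f:
#         for lang in ("bam", "ful"):
#             for ref in get_hf_dataset_refs(lang):
#                 f.write(json.dumps({"language": lang, **ref}, ensure_ascii=False) + "\n")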