"""
Web harvester — pulls Bambara/Fula data from public internet sources
into the sahel-agri-feedback HF dataset repo.

Sources:
  - RobotsMali/jeli-asr         (HF, 33k Bambara audio+text samples)
  - google/WaxalNLP, Pullo-Africa-Protagonist/Fula-pular, guizme/adlam_fulfulde
                                 (HF, Fula audio+text; see HF_DATASET_REGISTRY)
  - bm.wikipedia.org / ff.wikipedia.org  (Wikipedia API, text only -> vocabulary.jsonl)

HF audio datasets are registered by reference in dataset_sources.jsonl — the
Kaggle notebook loads them directly at training time.  This avoids re-uploading
gigabytes of audio through the Space (which would time out every time).
"""
from __future__ import annotations

import io
import json
import time

# MediaWiki API endpoint for each supported language code.  Looked up by
# harvest_wikipedia_text(); a language missing from this mapping yields no
# Wikipedia entries.
WIKI_APIS = {
    "bam": "https://bm.wikipedia.org/w/api.php",
    "ful": "https://ff.wikipedia.org/w/api.php",
}

# Sent as the User-Agent header on every Wikipedia API request.
# Wikipedia requires a descriptive User-Agent or returns 403.
_UA = "SahelVoiceAI/1.0 (Bambara/Fula language research; huggingface.co/spaces/ous-sow/sahel-agri-voice)"

# Datasets registered by reference — the Kaggle notebook loads them directly,
# so no audio is re-uploaded from here.  Keyed by the same language codes as
# WIKI_APIS; get_hf_dataset_refs() returns the list for one language.
#
# Entry fields (consumed outside this module — meanings below are inferred
# from the key names and the "note" texts; confirm against the notebook):
#   repo / config / split   dataset repo id, config name and split to load
#   audio_col / text_col    names of the audio and transcript columns
#   max                     presumably a cap on rows taken from the source
#   license                 license identifier of the source dataset
#   note                    optional free-text remark about the source
#   adlam                   optional flag, set only on the Adlam-script source
#                           (its note says rows are converted to Latin before
#                           training)
HF_DATASET_REGISTRY = {
    "bam": [
        {
            "repo":      "RobotsMali/jeli-asr",
            "config":    "jeli-asr",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "bam",
            "max":       5_000,
            "license":   "cc-by-4.0",
        },
    ],
    "ful": [
        {
            "repo":      "google/WaxalNLP",
            "config":    "ful_asr",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "transcription",
            "max":       2_000,
            "license":   "cc-by-4.0",
        },
        {
            "repo":      "Pullo-Africa-Protagonist/Fula-pular",
            "config":    "default",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "transcription",
            "max":       5_000,
            "license":   "cc-by-4.0",
            "note":      "9,761 Pular (Guinea) audio rows — primary ASR training source",
        },
        {
            "repo":      "guizme/adlam_fulfulde",
            "config":    "default",
            "split":     "train",
            "audio_col": "audio",
            "text_col":  "transcription",
            "max":       51,
            "license":   "cc-by-4.0",
            "adlam":     True,
            "note":      "51 Adlam-script audio rows — converted to Latin before training",
        },
        # PLACEHOLDER (disabled): OpenSLR SLR106 — West African Virtual Assistant
        # ASR Corpus (Guinea, CC BY-SA 4.0).
        # 10,083 clean utterances from 49 Guinea-native Pular speakers (read speech,
        # multi-device, ages 5–76).  Best available clean Guinea Pular ASR data.
        # Download manually from https://openslr.org/106/ and upload to HF before training.
        # Uncomment once the dataset repo is populated.
        # {
        #     "repo":      "ous-sow/slr106-pular",
        #     "config":    "default",
        #     "split":     "train",
        #     "audio_col": "audio",
        #     "text_col":  "transcription",
        #     "max":       10_000,
        #     "license":   "cc-by-sa-4.0",
        #     "note":      "OpenSLR SLR106 Guinea Pular — 10k clean utterances, 49 speakers",
        # },
        # PLACEHOLDER (disabled): OpenSLR SLR105 — West African Radio Corpus
        # (Guinea, CC BY-SA 4.0).
        # ~142 hours raw radio audio from 6 Guinea stations; Pular-tagged validation
        # set of 300 clips.  Noisier than SLR106 but larger.
        # Uncomment once uploaded to HF.
        # {
        #     "repo":      "ous-sow/slr105-pular",
        #     "config":    "default",
        #     "split":     "validation",
        #     "audio_col": "audio",
        #     "text_col":  "transcription",
        #     "max":       300,
        #     "license":   "cc-by-sa-4.0",
        #     "note":      "OpenSLR SLR105 Guinea radio — Pular-tagged validation split",
        # },
    ],
}


# ── Wikipedia text harvest ────────────────────────────────────────────────────

def harvest_wikipedia_text(lang: str, max_articles: int = 100) -> list[dict]:
    """
    Fetch short sentences from articles of the language's Wikipedia.

    Lists up to ``max_articles`` non-redirect article titles, requests a
    plain-text extract (first 3 sentences) for them in batches of 20, splits
    each extract on "." and keeps every sentence of 3 to 20 words.

    Args:
        lang: Language code; must be a key of ``WIKI_APIS``.
        max_articles: Maximum number of article titles to request.

    Returns:
        List of {word, translation, language, source} dicts for
        vocabulary.jsonl, where ``word`` is the sentence and ``translation``
        is the title of the article it came from.  Empty when ``lang`` is
        unknown, ``max_articles`` is not positive, or nothing usable is found.

    Raises:
        Exceptions from the initial title-listing request (network errors,
        invalid JSON) propagate to the caller.  A failing extract batch is
        logged and skipped so the remaining batches are still harvested.
    """
    import http.client
    import logging
    import urllib.parse
    import urllib.request

    api_url = WIKI_APIS.get(lang)
    if not api_url or max_articles <= 0:
        # Unknown language or nothing requested: skip the network entirely.
        return []

    log = logging.getLogger(__name__)

    def _get(params: dict) -> dict:
        """GET the API with ``params`` and return the decoded JSON body."""
        url = f"{api_url}?{urllib.parse.urlencode(params)}"
        req = urllib.request.Request(url, headers={"User-Agent": _UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return json.loads(r.read())

    # Step 1: get article titles.
    # NOTE(review): the API enforces its own upper bound on aplimit (500 for
    # ordinary clients per the MediaWiki docs), so a larger max_articles is
    # presumably truncated server-side — add continuation if that matters.
    data = _get({"action": "query", "list": "allpages", "aplimit": max_articles,
                 "apfilterredir": "nonredirects", "format": "json"})
    titles = [p["title"] for p in data.get("query", {}).get("allpages", [])]
    if not titles:
        return []

    # Step 2: fetch plain-text extracts in batches of 20
    batch_size = 20
    entries: list[dict] = []
    for start in range(0, len(titles), batch_size):
        batch = titles[start:start + batch_size]
        try:
            # exintro is required here: without it TextExtracts lowers exlimit
            # to 1 for multi-title requests, so only one page per batch would
            # come back with an extract.
            data2 = _get({"action": "query", "titles": "|".join(batch),
                          "prop": "extracts", "exsentences": 3, "exintro": True,
                          "exlimit": len(batch), "explaintext": True, "format": "json"})
        except (OSError, http.client.HTTPException, ValueError) as exc:
            # Best effort: one bad batch must not discard the others, but the
            # failure should be visible in the logs rather than swallowed.
            log.warning("Wikipedia extract batch %d-%d for %r failed: %s",
                        start, start + len(batch) - 1, lang, exc)
            data2 = {}

        pages = data2.get("query", {}).get("pages", {}) if isinstance(data2, dict) else {}
        if not isinstance(pages, dict):
            pages = {}
        for page in pages.values():
            extract = (page.get("extract") or "").strip()
            title = (page.get("title") or "").strip()
            if not extract or not title:
                continue
            for sentence in extract.replace("\n", " ").split("."):
                sentence = sentence.strip()
                if 3 <= len(sentence.split()) <= 20:
                    entries.append({
                        "word":        sentence,
                        "translation": title,
                        "language":    lang,
                        "source":      "wikipedia",
                    })
        # Pause between batches to stay polite to the API.
        time.sleep(0.3)

    return entries


# ── HF dataset registration (reference-based, no re-upload) ──────────────────

def get_hf_dataset_refs(lang: str) -> list[dict]:
    """Return the dataset reference dicts registered for this language.

    The result is a fresh list of copied dicts, so callers may filter or
    annotate it without altering ``HF_DATASET_REGISTRY``.  A language with no
    registered datasets yields an empty list.
    """
    # Registry entries hold only scalar values, so a shallow copy per entry
    # is enough to isolate callers from the module-level registry.
    return [dict(ref) for ref in HF_DATASET_REGISTRY.get(lang, ())]