Spaces:

MataStrategy
/

ground-zero

Sleeping

jefffffff9 Claude Sonnet 4.6 committed on Apr 6

Commit

d2183cd

1 Parent(s): fba4954

Switch training dataset from google/waxal to google/fleurs

google/waxal was removed from HuggingFace Hub. google/fleurs is the
maintained replacement with identical column schema (audio, transcription).
Subset mapping: bam → bam_ML, ful → ff_SN.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

src/data/waxal_loader.py +22 -9

src/data/waxal_loader.py CHANGED Viewed

@@ -1,6 +1,10 @@
 """
-Loads and preprocesses the google/waxal dataset for Bambara (bam) and Fula (ful).
 Uses streaming to avoid downloading the full corpus before training.
 """
 from __future__ import annotations
@@ -20,14 +24,20 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
-# google/waxal column names
 AUDIO_COL = "audio"
 TEXT_COL = "transcription"
 TARGET_SR = 16_000
 class WaxalDataLoader:
-    """Streams the google/waxal dataset and prepares examples for Whisper training."""
     def __init__(
         self,
@@ -35,22 +45,25 @@ class WaxalDataLoader:
         config: dict,
         hf_token: str | None = None,
     ) -> None:
-        if subset not in ("bam", "ful"):
             raise ValueError(f"subset must be 'bam' or 'ful', got '{subset}'")
         self.subset = subset
         self.config = config
         self.hf_token = hf_token
     def load_split(self, split: str = "train", streaming: bool = True) -> "IterableDataset | Dataset":
-        """Return a single split of google/waxal."""
-        logger.info("Loading google/waxal subset=%s split=%s streaming=%s", self.subset, split, streaming)
         ds = load_dataset(
-            "google/waxal",
-            self.subset,
             split=split,
             token=self.hf_token,
             streaming=streaming,
-            trust_remote_code=True,
         )
         if streaming:
             ds = ds.shuffle(seed=42, buffer_size=1000)

 """
+Loads and preprocesses the google/fleurs dataset for Bambara (bam) and Fula (ful).
 Uses streaming to avoid downloading the full corpus before training.
+google/waxal was removed from the Hub; google/fleurs is the maintained replacement.
+Subset mapping: bam → bam_ML (Bambara Mali), ful → ff_SN (Fula/Pular Senegal).
+Column names (audio, transcription) are identical.
 """
 from __future__ import annotations
 logger = logging.getLogger(__name__)
+# google/fleurs column names (identical to the former google/waxal schema)
 AUDIO_COL = "audio"
 TEXT_COL = "transcription"
 TARGET_SR = 16_000
+# Map our short language codes to google/fleurs subset names
+_FLEURS_SUBSET = {
+    "bam": "bam_ML",  # Bambara — Mali
+    "ful": "ff_SN",   # Fula/Pular — Senegal
+}
 class WaxalDataLoader:
+    """Streams the google/fleurs dataset and prepares examples for Whisper training."""
     def __init__(
         self,
         config: dict,
         hf_token: str | None = None,
     ) -> None:
+        if subset not in _FLEURS_SUBSET:
             raise ValueError(f"subset must be 'bam' or 'ful', got '{subset}'")
         self.subset = subset
+        self._fleurs_subset = _FLEURS_SUBSET[subset]
         self.config = config
         self.hf_token = hf_token
     def load_split(self, split: str = "train", streaming: bool = True) -> "IterableDataset | Dataset":
+        """Return a single split of google/fleurs."""
+        logger.info(
+            "Loading google/fleurs subset=%s (%s) split=%s streaming=%s",
+            self._fleurs_subset, self.subset, split, streaming,
+        )
         ds = load_dataset(
+            "google/fleurs",
+            self._fleurs_subset,
             split=split,
             token=self.hf_token,
             streaming=streaming,
         )
         if streaming:
             ds = ds.shuffle(seed=42, buffer_size=1000)