jefffffff9 Claude Sonnet 4.6 committed on
Commit
d2183cd
·
1 Parent(s): fba4954

Switch training dataset from google/waxal to google/fleurs

Browse files

google/waxal was removed from HuggingFace Hub. google/fleurs is the
maintained replacement with identical column schema (audio, transcription).
Subset mapping: bam → bam_ML, ful → ff_SN.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/data/waxal_loader.py +22 -9
src/data/waxal_loader.py CHANGED
@@ -1,6 +1,10 @@
1
  """
2
- Loads and preprocesses the google/waxal dataset for Bambara (bam) and Fula (ful).
3
  Uses streaming to avoid downloading the full corpus before training.
 
 
 
 
4
  """
5
  from __future__ import annotations
6
 
@@ -20,14 +24,20 @@ if TYPE_CHECKING:
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
- # google/waxal column names
24
  AUDIO_COL = "audio"
25
  TEXT_COL = "transcription"
26
  TARGET_SR = 16_000
27
 
 
 
 
 
 
 
28
 
29
  class WaxalDataLoader:
30
- """Streams the google/waxal dataset and prepares examples for Whisper training."""
31
 
32
  def __init__(
33
  self,
@@ -35,22 +45,25 @@ class WaxalDataLoader:
35
  config: dict,
36
  hf_token: str | None = None,
37
  ) -> None:
38
- if subset not in ("bam", "ful"):
39
  raise ValueError(f"subset must be 'bam' or 'ful', got '{subset}'")
40
  self.subset = subset
 
41
  self.config = config
42
  self.hf_token = hf_token
43
 
44
  def load_split(self, split: str = "train", streaming: bool = True) -> "IterableDataset | Dataset":
45
- """Return a single split of google/waxal."""
46
- logger.info("Loading google/waxal subset=%s split=%s streaming=%s", self.subset, split, streaming)
 
 
 
47
  ds = load_dataset(
48
- "google/waxal",
49
- self.subset,
50
  split=split,
51
  token=self.hf_token,
52
  streaming=streaming,
53
- trust_remote_code=True,
54
  )
55
  if streaming:
56
  ds = ds.shuffle(seed=42, buffer_size=1000)
 
1
  """
2
+ Loads and preprocesses the google/fleurs dataset for Bambara (bam) and Fula (ful).
3
  Uses streaming to avoid downloading the full corpus before training.
4
+
5
+ google/waxal was removed from the Hub; google/fleurs is the maintained replacement.
6
+ Subset mapping: bam → bam_ML (Bambara Mali), ful → ff_SN (Fula/Pular Senegal).
7
+ Column names (audio, transcription) are identical.
8
  """
9
  from __future__ import annotations
10
 
 
24
 
25
  logger = logging.getLogger(__name__)
26
 
27
+ # google/fleurs column names (identical to the former google/waxal schema)
28
  AUDIO_COL = "audio"
29
  TEXT_COL = "transcription"
30
  TARGET_SR = 16_000
31
 
32
+ # Map our short language codes to google/fleurs subset names
33
+ _FLEURS_SUBSET = {
34
+ "bam": "bam_ML", # Bambara — Mali
35
+ "ful": "ff_SN", # Fula/Pular — Senegal
36
+ }
37
+
38
 
39
  class WaxalDataLoader:
40
+ """Streams the google/fleurs dataset and prepares examples for Whisper training."""
41
 
42
  def __init__(
43
  self,
 
45
  config: dict,
46
  hf_token: str | None = None,
47
  ) -> None:
48
+ if subset not in _FLEURS_SUBSET:
49
  raise ValueError(f"subset must be 'bam' or 'ful', got '{subset}'")
50
  self.subset = subset
51
+ self._fleurs_subset = _FLEURS_SUBSET[subset]
52
  self.config = config
53
  self.hf_token = hf_token
54
 
55
  def load_split(self, split: str = "train", streaming: bool = True) -> "IterableDataset | Dataset":
56
+ """Return a single split of google/fleurs."""
57
+ logger.info(
58
+ "Loading google/fleurs subset=%s (%s) split=%s streaming=%s",
59
+ self._fleurs_subset, self.subset, split, streaming,
60
+ )
61
  ds = load_dataset(
62
+ "google/fleurs",
63
+ self._fleurs_subset,
64
  split=split,
65
  token=self.hf_token,
66
  streaming=streaming,
 
67
  )
68
  if streaming:
69
  ds = ds.shuffle(seed=42, buffer_size=1000)