File size: 5,176 Bytes
e82a88e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.

Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
write documents in ASCENDING quality order so a sequential read during training
acts as a curriculum — the model sees noisier web text first and the densest
material (textbooks, then Wikipedia) last. Research shows this ordering plus a
moderate LR decay beats random shuffling for free.

The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
little denser (more math, stricter web filter):

    FineWeb-HQ (score-gated web)   45%   ~710M tokens   [first / lowest density]
    Python stack (filtered)        10%   ~160M tokens
    FineMath-4+                    15%   ~235M tokens
    Cosmopedia (stanford+wikihow)  25%   ~395M tokens
    Wikipedia EN                    5%    ~80M tokens   [last / highest density]
                                   ----  -----------
                                   100%  ~1.57B tokens  (Chinchilla-optimal)

Usage:
    python prepare_data.py                 # full ~1.57B token build
    python prepare_data.py --smoke         # tiny build to test the pipeline
"""

from __future__ import annotations

import argparse
import os

import numpy as np

from huggingface_hub import login

# Authenticate against the Hugging Face Hub with a token taken from the
# environment. Credentials must never be hard-coded in source: anyone who can
# read the file (or its history) can use them.
# NOTE(review): the token that used to be committed on this line should be
# treated as compromised and revoked.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

# Where the trained tokenizer is read from and where the packed .bin files go.
TOKENIZER_PATH = "ivme_tokenizer.json"
OUT_DIR = "data"
# On-disk dtype used for every token id written by build().
DTYPE = np.uint16

# Per-source token targets as (source_key, target_tokens) pairs, listed in
# ASCENDING quality order; build() processes the sources in list order.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]
# Smoke-test budget: the same sources in the same order, 200k tokens apiece.
SMOKE_BUDGET = [(name, 200_000) for name, _full_target in TOKEN_BUDGET]

# Total validation tokens. build() splits this across the sources in proportion
# to each source's share of the budget and takes each share from the START of
# that source's stream.
VAL_TOKENS = 2_000_000


# Registry of pretraining sources: source_key -> (text_field, subsets). Each
# subset is the tuple of positional arguments handed to load_dataset (the repo
# id, optionally followed by a config name).
_SOURCES = {
    "fineweb_hq": ("text", [("epfml/FineWeb-HQ",)]),
    "cosmopedia": (
        "text",
        # Two dense subsets, streamed back to back.
        [
            ("HuggingFaceTB/cosmopedia", "stanford"),
            ("HuggingFaceTB/cosmopedia", "wikihow"),
        ],
    ),
    "finemath": ("text", [("HuggingFaceTB/finemath", "finemath-4plus")]),
    "python": ("content", [("bigcode/python-stack-v1-functions-filtered",)]),
    "wikipedia": ("text", [("wikimedia/wikipedia", "20231101.en")]),
}


def make_stream(source_key):
    """Return (iterable_of_rows, text_field) for a source.

    Args:
        source_key: One of the keys of ``_SOURCES`` ("fineweb_hq", "python",
            "finemath", "cosmopedia", "wikipedia").

    Returns:
        A 2-tuple ``(rows, field)``. ``rows`` is the streaming "train" split
        of the source (several subsets are chained in registry order) and
        ``field`` is the name of the row entry that holds the document text.

    Raises:
        ValueError: If ``source_key`` is not a registered source.
    """
    # Validate first so a typo fails immediately with a useful message,
    # before the `datasets` import below is even attempted.
    try:
        field, subsets = _SOURCES[source_key]
    except (KeyError, TypeError):
        known = ", ".join(sorted(_SOURCES))
        raise ValueError(
            f"unknown source {source_key!r}; expected one of: {known}"
        ) from None

    from itertools import chain

    from datasets import load_dataset

    streams = [
        load_dataset(*dataset_args, split="train", streaming=True)
        for dataset_args in subsets
    ]
    if len(streams) == 1:
        return streams[0], field
    return chain(*streams), field


def build(budget):
    """Tokenize every source in *budget* and pack the ids into flat binary files.

    Sources are processed in the order given. Each document is encoded, an
    ``<|eos|>`` id is appended as a document boundary, and the ids are stored
    as ``DTYPE``. For each source, the first documents (until that source's
    share of ``VAL_TOKENS`` is reached) go to ``val.bin``; the rest are
    appended to ``train.bin``. Validation tokens count towards the source's
    target. Documents are never split, so a source may overshoot its target
    by at most one document.

    Args:
        budget: Sequence of ``(source_key, target_tokens)`` pairs, e.g.
            ``TOKEN_BUDGET`` or ``SMOKE_BUDGET``.

    Raises:
        ValueError: If the tokenizer has no ``<|eos|>`` token, if its
            vocabulary does not fit in ``DTYPE``, or if a source key is
            rejected by ``make_stream``.
    """
    from tokenizers import Tokenizer

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        # token_to_id returns None for unknown tokens; fail here instead of
        # with an obscure numpy conversion error on the first document.
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    # Every id must be representable in the on-disk dtype, otherwise the
    # packed data would be corrupt (or the conversion would raise mid-build).
    max_id = int(np.iinfo(DTYPE).max)
    vocab_size = tok.get_vocab_size()
    if vocab_size - 1 > max_id:
        raise ValueError(
            f"vocab size {vocab_size} does not fit in {np.dtype(DTYPE).name} "
            f"(max id {max_id})"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")

    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    val_buf = []  # small, held in memory
    written_train = 0

    # The context manager guarantees train.bin is flushed and closed even if
    # a download or tokenization error aborts the build.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0
            # Reserve a slice of each source's HEAD for validation, sized in
            # proportion to the source's share of the total budget.
            if total_target:
                val_target = int(VAL_TOKENS * (target / total_target))
            else:
                val_target = 0
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Check the budget before tokenizing so no document is
                # encoded only to be thrown away.
                if src_written >= target:
                    break
                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)

                # Send the first val_target tokens of this source to val,
                # rest to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)

                # True exactly when this document crossed a 5M-token boundary.
                if src_written % 5_000_000 < len(arr):
                    print(f"  [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

            if src_written < target:
                # The stream ran dry; surface it rather than silently
                # producing a smaller mix than requested.
                print(
                    f"[data] WARNING: {source_key} exhausted at "
                    f"{src_written:,} of {target:,} tokens"
                )

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)

    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin   : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")


if __name__ == "__main__":
    # Command-line entry point: --smoke swaps the full token budget for the
    # tiny one so the whole pipeline can be exercised quickly.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke",
        action="store_true",
        help="tiny build to test the pipeline",
    )
    cli = parser.parse_args()
    if cli.smoke:
        build(SMOKE_BUDGET)
    else:
        build(TOKEN_BUDGET)