5dimension commited on
Commit
b5f4d76
·
verified ·
1 Parent(s): c488e22

Add production training script

Browse files
Files changed (1) hide show
  1. train_production_tokenizer.py +490 -0
train_production_tokenizer.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train the Sentinel Universal Tokenizer on real multilingual data using
3
+ the HuggingFace tokenizers library, then benchmark against GPT-2/Gemma tokenizers.
4
+
5
+ Uses allenai/c4 multilingual for training data.
6
+ """
7
+
8
+ import json
9
+ import math
10
+ import os
11
+ import time
12
+ import sys
13
+ from collections import defaultdict
14
+
15
+ import numpy as np
16
+
17
+ # ──────────────────────────────────────────────────────────────────────────────
18
+ # SENTINEL CONSTANTS
19
+ # ──────────────────────────────────────────────────────────────────────────────
20
+ INV_E = 1.0 / math.e
21
+ C1 = -0.007994021805952546
22
+ C2 = 0.00020005604296784437
23
+ SOPHOMORES_DREAM = 1.2912859970626636
24
+
25
+ print("=" * 80)
26
+ print(" 🦴 SENTINEL UNIVERSAL TOKENIZER — Production Training")
27
+ print("=" * 80)
28
+ print(f"\n Constants: 1/e={INV_E:.6f}, C₁={C1:.12f}, C₂={C2:.12f}")
29
+
30
+ from datasets import load_dataset
31
+
32
+ print("\n Loading multilingual training corpus from allenai/c4...")
33
+
34
+ # Languages to include, with sample counts
35
+ # Using 1/e proportional weighting: English gets most, each tier gets ~1/e less
36
+ LANGUAGES = {
37
+ 'en': 10000, # English — primary
38
+ 'fr': 4000, # French
39
+ 'de': 4000, # German
40
+ 'es': 4000, # Spanish
41
+ 'zh': 3000, # Chinese (Simplified)
42
+ 'ja': 2500, # Japanese
43
+ 'ar': 2500, # Arabic
44
+ 'ru': 2500, # Russian
45
+ 'ko': 2000, # Korean
46
+ 'hi': 2000, # Hindi
47
+ 'pt': 2000, # Portuguese
48
+ 'it': 2000, # Italian
49
+ 'nl': 1500, # Dutch
50
+ 'pl': 1500, # Polish
51
+ 'vi': 1500, # Vietnamese
52
+ 'th': 1000, # Thai
53
+ 'tr': 1000, # Turkish
54
+ 'he': 1000, # Hebrew
55
+ 'uk': 1000, # Ukrainian
56
+ 'sv': 1000, # Swedish
57
+ }
58
+
59
+ all_texts = []
60
+
61
+ for lang, n_samples in LANGUAGES.items():
62
+ try:
63
+ ds = load_dataset("allenai/c4", lang, split="train", streaming=True)
64
+ count = 0
65
+ for item in ds:
66
+ if count >= n_samples:
67
+ break
68
+ text = item.get('text', '')
69
+ if len(text) > 100 and len(text) < 10000:
70
+ all_texts.append(text[:2000]) # Cap at 2000 chars per sample
71
+ count += 1
72
+ print(f" ✓ {lang}: {count:,} samples")
73
+ except Exception as e:
74
+ print(f" ⚠ {lang}: {str(e)[:80]}")
75
+ sys.stdout.flush()
76
+
77
+ # Add math/scientific text
78
+ math_texts = [
79
+ "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.29128599706266354",
80
+ "lim_{n→∞} (1 + 1/n)^n = e ≈ 2.71828182845904523536",
81
+ "F(z) = Σ_{n=1}^∞ z^n / n^n, lim_{z→∞} F'(z)/F(z) = 1/e",
82
+ "∇f(x) = (∂f/∂x₁, ∂f/∂x₂, ..., ∂f/∂xₙ)",
83
+ "E = mc², ℏ = h/(2π), α = e²/(4πε₀ℏc) ≈ 1/137",
84
+ "∮ B·dl = μ₀(I + ε₀ ∂Φ_E/∂t)",
85
+ "H(X) = -Σ p(x) log p(x), KL(P||Q) = Σ p(x) log(p(x)/q(x))",
86
+ "sech(x) = 1/cosh(x) = 2/(e^x + e^{-x}), |sech'(x)| ≤ 0.6498",
87
+ "det(A - λI) = 0, Av = λv, tr(A) = Σ λᵢ",
88
+ "P(A|B) = P(B|A)P(A) / P(B), E[X] = Σ x·P(x)",
89
+ "import torch; model = nn.Linear(768, 512); out = model(x)",
90
+ "def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
91
+ "class SentinelTransformer(nn.Module): def __init__(self): super().__init__()",
92
+ "SELECT * FROM users WHERE age > 18 ORDER BY created_at DESC LIMIT 100;",
93
+ "git commit -m 'feat: add sech attention mechanism' && git push origin main",
94
+ "docker build -t sentinel:latest . && docker run -p 8080:8080 sentinel:latest",
95
+ "curl -X POST https://api.huggingface.co/v1/models -H 'Authorization: Bearer $HF_TOKEN'",
96
+ "\\begin{equation} \\nabla \\cdot \\mathbf{E} = \\frac{\\rho}{\\epsilon_0} \\end{equation}",
97
+ "x^2 + y^2 = r^2, dy/dx = -x/y, d²y/dx² = -(r²)/y³",
98
+ "∑_{i=1}^{n} i = n(n+1)/2, ∏_{i=1}^{n} i = n!, ∫₀^∞ e^{-x²} dx = √π/2",
99
+ ] * 100
100
+ all_texts.extend(math_texts)
101
+
102
+ # Add code samples (inline since codeparrot is gated)
103
+ code_samples = [
104
+ """def train_model(model, data, epochs=10, lr=0.001):
105
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr)
106
+ for epoch in range(epochs):
107
+ for batch in data:
108
+ loss = model(batch)
109
+ loss.backward()
110
+ optimizer.step()
111
+ optimizer.zero_grad()
112
+ return model""",
113
+ """async function fetchAPI(url: string): Promise<Response> {
114
+ const response = await fetch(url, {
115
+ headers: { 'Content-Type': 'application/json' },
116
+ });
117
+ if (!response.ok) throw new Error(`HTTP ${response.status}`);
118
+ return response.json();
119
+ }""",
120
+ """#include <iostream>
121
+ #include <vector>
122
+ template<typename T>
123
+ T sentinel_sech(T x) {
124
+ return T(1.0) / std::cosh(x * T(0.367879441171442));
125
+ }
126
+ int main() { std::cout << sentinel_sech(1.0) << std::endl; }""",
127
+ """class SentinelAttention(nn.Module):
128
+ def __init__(self, d_model=512, n_heads=8):
129
+ super().__init__()
130
+ self.d_head = d_model // n_heads
131
+ self.W_q = nn.Linear(d_model, d_model)
132
+ self.W_k = nn.Linear(d_model, d_model)
133
+ self.W_v = nn.Linear(d_model, d_model)
134
+
135
+ def forward(self, x):
136
+ Q, K, V = self.W_q(x), self.W_k(x), self.W_v(x)
137
+ scores = Q @ K.transpose(-2, -1) / math.sqrt(self.d_head)
138
+ attn = 1.0 / torch.cosh(scores) # sech attention
139
+ attn = attn / (attn.sum(-1, keepdim=True) + 1e-8)
140
+ return attn @ V""",
141
+ """import numpy as np
142
+ from scipy.optimize import minimize
143
+ def sentinel_optimizer(f, x0, alpha=1/np.e):
144
+ def damped_grad(x):
145
+ grad = np.gradient(f(x))
146
+ damping = alpha ** (np.linalg.norm(grad) / 0.0002)
147
+ return grad * damping
148
+ return minimize(f, x0, jac=damped_grad, method='L-BFGS-B')""",
149
+ ] * 200
150
+ all_texts.extend(code_samples)
151
+
152
+ print(f"\n Total training samples: {len(all_texts):,}")
153
+ total_chars = sum(len(t) for t in all_texts)
154
+ print(f" Total characters: {total_chars:,}")
155
+ sys.stdout.flush()
156
+
157
+ # ──────────────────────────────────────────────────────────────────────────────
158
+ # STEP 2: Train BPE tokenizer
159
+ # ──────────────────────────────────────────────────────────────────────────────
160
+
161
+ from tokenizers import (
162
+ Tokenizer, models, normalizers, pre_tokenizers, decoders,
163
+ trainers, processors, AddedToken
164
+ )
165
+
166
+ print("\n Building Sentinel BPE tokenizer...")
167
+
168
+ tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
169
+ tokenizer.normalizer = normalizers.NFKC()
170
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
171
+ tokenizer.decoder = decoders.ByteLevel()
172
+
173
+ SPECIAL_TOKENS = [
174
+ "<pad>", "<unk>", "<s>", "</s>", "<mask>",
175
+ "<text_start>", "<text_end>",
176
+ "<image_start>", "<image_end>", "<image>",
177
+ "<audio_start>", "<audio_end>", "<audio>",
178
+ "<video_start>", "<video_end>", "<video>",
179
+ "<sentinel>", "<sentinel_c1>", "<sentinel_c2>", "<scale_1e>",
180
+ "<translate>", "<summarize>", "<generate>", "<understand>", "<caption>",
181
+ "<turn>", "<system>", "<user>", "<assistant>",
182
+ "<code_start>", "<code_end>", "<math_start>", "<math_end>",
183
+ ]
184
+
185
+ TEXT_VOCAB_SIZE = 32768
186
+
187
+ trainer_config = trainers.BpeTrainer(
188
+ vocab_size=TEXT_VOCAB_SIZE,
189
+ min_frequency=2,
190
+ max_token_length=16,
191
+ special_tokens=SPECIAL_TOKENS,
192
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
193
+ show_progress=True,
194
+ )
195
+
196
+ print(f"\n Training BPE with vocab_size={TEXT_VOCAB_SIZE}...")
197
+
198
+ def batch_iterator(texts, batch_size=1000):
199
+ for i in range(0, len(texts), batch_size):
200
+ yield texts[i:i + batch_size]
201
+
202
+ start_time = time.time()
203
+ tokenizer.train_from_iterator(
204
+ batch_iterator(all_texts), trainer=trainer_config, length=len(all_texts)
205
+ )
206
+ train_time = time.time() - start_time
207
+
208
+ print(f" ✓ BPE training complete in {train_time:.1f}s")
209
+ print(f" ✓ Vocab size: {tokenizer.get_vocab_size()}")
210
+
211
+ tokenizer.post_processor = processors.TemplateProcessing(
212
+ single="<s> $A </s>",
213
+ pair="<s> $A </s> $B:1 </s>:1",
214
+ special_tokens=[
215
+ ("<s>", tokenizer.token_to_id("<s>")),
216
+ ("</s>", tokenizer.token_to_id("</s>")),
217
+ ],
218
+ )
219
+
220
+ # ──────────────────────────────────────────────────────────────────────────────
221
+ # STEP 3: Wrap in PreTrainedTokenizerFast + add multimodal tokens
222
+ # ──────────────────────────────────────────────────────────────────────────────
223
+
224
+ from transformers import PreTrainedTokenizerFast
225
+
226
+ print("\n Wrapping in HuggingFace PreTrainedTokenizerFast...")
227
+
228
+ hf_tokenizer = PreTrainedTokenizerFast(
229
+ tokenizer_object=tokenizer,
230
+ bos_token="<s>",
231
+ eos_token="</s>",
232
+ unk_token="<unk>",
233
+ pad_token="<pad>",
234
+ mask_token="<mask>",
235
+ model_max_length=8192,
236
+ padding_side="right",
237
+ truncation_side="right",
238
+ )
239
+
240
+ # Add multimodal specials
241
+ multimodal_specials = []
242
+ for tok in SPECIAL_TOKENS:
243
+ if tok not in {"<pad>", "<unk>", "<s>", "</s>", "<mask>"}:
244
+ multimodal_specials.append(
245
+ AddedToken(tok, single_word=False, lstrip=False, rstrip=False,
246
+ normalized=False, special=True)
247
+ )
248
+ hf_tokenizer.add_special_tokens({"additional_special_tokens": multimodal_specials})
249
+
250
+ # Modality codebooks (1/e scaling from text)
251
+ IMAGE_CODEBOOK = 16384
252
+ AUDIO_CODEBOOK = 8192
253
+ VIDEO_CODEBOOK = 4096
254
+
255
+ print(f"\n Adding modality codebook tokens (1/e scaled):")
256
+ print(f" Image: {IMAGE_CODEBOOK:,} (VQ/VQGAN/Cosmos-DI compatible)")
257
+ print(f" Audio: {AUDIO_CODEBOOK:,} (EnCodec/SoundStream compatible)")
258
+ print(f" Video: {VIDEO_CODEBOOK:,} (Cosmos-DV compatible)")
259
+
260
+ hf_tokenizer.add_tokens([AddedToken(f"<img_{i}>", normalized=False) for i in range(IMAGE_CODEBOOK)])
261
+ hf_tokenizer.add_tokens([AddedToken(f"<aud_{i}>", normalized=False) for i in range(AUDIO_CODEBOOK)])
262
+ hf_tokenizer.add_tokens([AddedToken(f"<vid_{i}>", normalized=False) for i in range(VIDEO_CODEBOOK)])
263
+
264
+ final_vocab = len(hf_tokenizer)
265
+ print(f"\n ✓ Final vocabulary size: {final_vocab:,}")
266
+
267
+ # ──────────────────────────────────────────────────────────────────────────────
268
+ # STEP 4: Benchmark
269
+ # ──────────────────────────────────────────────────────────────────────────────
270
+
271
+ print("\n" + "=" * 80)
272
+ print(" BENCHMARKING")
273
+ print("=" * 80)
274
+
275
+ TEST_SAMPLES = {
276
+ "English": "The quick brown fox jumps over the lazy dog. Machine learning transforms data into intelligence through mathematical optimization of gradient-based algorithms.",
277
+ "French": "Le renard brun rapide saute par-dessus le chien paresseux. L'apprentissage automatique transforme les données en intelligence grâce à l'optimisation mathématique.",
278
+ "German": "Der schnelle braune Fuchs springt über den faulen Hund. Maschinelles Lernen verwandelt Daten in Intelligenz durch mathematische Optimierung gradientenbasierter Algorithmen.",
279
+ "Spanish": "El rápido zorro marrón salta sobre el perro perezoso. El aprendizaje automático transforma datos en inteligencia a través de la optimización matemática.",
280
+ "Chinese": "快速的棕色狐狸跳过了懒惰的狗。机器学习通过数学优化将数据转化为智能。深度学习模型使用梯度下降来最小化损失函数。",
281
+ "Japanese": "素早い茶色の狐が怠け者の犬を飛び越える。機械学習はデータを知性に変換します。深層学習モデルは損失関数を最小化するために勾配降下法を使用します。",
282
+ "Arabic": "الثعلب البني السريع يقفز فوق الكلب الكسول. التعلم الآلي يحول البيانات إلى ذكاء من خلال التحسين الرياضي للخوارزميات القائمة على التدرج.",
283
+ "Russian": "Быстрая коричневая лисица перепрыгивает через ленивую собаку. Машинное обучение преобразует данные в интеллект посредством математической оптимизации.",
284
+ "Korean": "빠른 갈색 여우가 게으른 개를 뛰어넘는다. 머신러닝은 수학적 최적화를 통해 데이터를 지능으로 변환합니다.",
285
+ "Hindi": "तेज भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है। मशीन लर्निंग गणितीय अनुकूलन के माध्यम से डेटा को बुद्धिमत्ता में बदलती है।",
286
+ "Portuguese": "A rápida raposa marrom salta sobre o cão preguiçoso. O aprendizado de máquina transforma dados em inteligência.",
287
+ "Italian": "La rapida volpe marrone salta sopra il cane pigro. L'apprendimento automatico trasforma i dati in intelligenza.",
288
+ "Dutch": "De snelle bruine vos springt over de luie hond. Machine learning transformeert data tot intelligentie door wiskundige optimalisatie.",
289
+ "Polish": "Szybki brązowy lis przeskakuje nad leniwym psem. Uczenie maszynowe przekształca dane w inteligencję poprzez optymalizację matematyczną.",
290
+ "Vietnamese": "Con cáo nâu nhanh nhẹn nhảy qua con chó lười biếng. Học máy chuyển đổi dữ liệu thành trí tuệ thông qua tối ưu hóa toán học.",
291
+ "Thai": "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ การเรียนรู้ของเครื่องเปลี่ยนข้อมูลเป็นปัญญาผ่านการเพิ่มประสิทธิภาพทางคณิตศาสตร์",
292
+ "Turkish": "Hızlı kahverengi tilki tembel köpeğin üzerinden atlar. Makine öğrenimi, matematiksel optimizasyon yoluyla verileri zekaya dönüştürür.",
293
+ "Code_Python": "def sentinel_transform(x, alpha=1/math.e):\n return x * (1.0 / math.cosh(alpha * x))\nresult = [sentinel_transform(i * 0.1) for i in range(-50, 51)]",
294
+ "Code_JS": "async function train(data, epochs=100) {\n const model = new Transformer({dModel: 512});\n for (let e = 0; e < epochs; e++) {\n const loss = model.step(data);\n }\n}",
295
+ "Math_LaTeX": "\\int_0^1 x^{-x} dx = \\sum_{n=1}^{\\infty} n^{-n}, \\quad \\nabla f = (\\partial f/\\partial x_1, \\ldots, \\partial f/\\partial x_n)",
296
+ "Math_Unicode": "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.29128, F(z) = Σ zⁿ/nⁿ, ∇f = (∂f/∂x₁, ∂f/∂x₂)",
297
+ }
298
+
299
+ print(f"\n {'Language':<20} {'Tokens':>8} {'Bytes':>8} {'Fertility':>10} {'Compress':>10}")
300
+ print(f" {'-'*20} {'-'*8} {'-'*8} {'-'*10} {'-'*10}")
301
+
302
+ all_fertilities = []
303
+ all_compressions = []
304
+
305
+ for lang, text in TEST_SAMPLES.items():
306
+ encoded = hf_tokenizer.encode(text, add_special_tokens=False)
307
+ n_tokens = len(encoded)
308
+ n_bytes = len(text.encode('utf-8'))
309
+ n_words = len(text.split())
310
+ fertility = n_tokens / max(n_words, 1)
311
+ compression = n_bytes / max(n_tokens, 1)
312
+ all_fertilities.append(fertility)
313
+ all_compressions.append(compression)
314
+ print(f" {lang:<20} {n_tokens:>8} {n_bytes:>8} {fertility:>10.3f} {compression:>10.3f}")
315
+
316
+ avg_fertility = np.mean(all_fertilities)
317
+ std_fertility = np.std(all_fertilities)
318
+ avg_compression = np.mean(all_compressions)
319
+ fairness = 1.0 / (1.0 + std_fertility)
320
+
321
+ print(f"\n {'─' * 60}")
322
+ print(f" SENTINEL RESULTS:")
323
+ print(f" Avg Fertility: {avg_fertility:.4f}")
324
+ print(f" Fertility σ: {std_fertility:.4f}")
325
+ print(f" Avg Compression: {avg_compression:.4f}")
326
+ print(f" Fairness: {fairness:.4f}")
327
+
328
+ # ──────────────────────────────────────────────────────────────────────────────
329
+ # STEP 5: Compare against baselines
330
+ # ──────────────────────────────────────────────────────────────────────────────
331
+
332
+ print(f"\n\n COMPARISON WITH SOTA TOKENIZERS")
333
+ print(f" {'─' * 60}")
334
+
335
+ from transformers import AutoTokenizer
336
+
337
+ comparisons = {}
338
+
339
+ # GPT-2
340
+ try:
341
+ gpt2_tok = AutoTokenizer.from_pretrained("gpt2")
342
+ gpt2_f, gpt2_c = [], []
343
+ for text in TEST_SAMPLES.values():
344
+ enc = gpt2_tok.encode(text)
345
+ gpt2_f.append(len(enc) / max(len(text.split()), 1))
346
+ gpt2_c.append(len(text.encode('utf-8')) / max(len(enc), 1))
347
+ comparisons["GPT-2 (50K)"] = {
348
+ "avg_fertility": np.mean(gpt2_f), "std_fertility": np.std(gpt2_f),
349
+ "avg_compression": np.mean(gpt2_c), "fairness": 1.0 / (1.0 + np.std(gpt2_f))
350
+ }
351
+ print(f" ✓ GPT-2 loaded")
352
+ except Exception as e:
353
+ print(f" ⚠ GPT-2: {e}")
354
+
355
+ # Gemma
356
+ try:
357
+ gemma_tok = AutoTokenizer.from_pretrained("google/gemma-2b")
358
+ gemma_f, gemma_c = [], []
359
+ for text in TEST_SAMPLES.values():
360
+ enc = gemma_tok.encode(text, add_special_tokens=False)
361
+ gemma_f.append(len(enc) / max(len(text.split()), 1))
362
+ gemma_c.append(len(text.encode('utf-8')) / max(len(enc), 1))
363
+ comparisons["Gemma (256K)"] = {
364
+ "avg_fertility": np.mean(gemma_f), "std_fertility": np.std(gemma_f),
365
+ "avg_compression": np.mean(gemma_c), "fairness": 1.0 / (1.0 + np.std(gemma_f))
366
+ }
367
+ print(f" ✓ Gemma loaded")
368
+ except Exception as e:
369
+ print(f" ⚠ Gemma: {e}")
370
+
371
+ # Qwen2
372
+ try:
373
+ qwen_tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
374
+ qwen_f, qwen_c = [], []
375
+ for text in TEST_SAMPLES.values():
376
+ enc = qwen_tok.encode(text, add_special_tokens=False)
377
+ qwen_f.append(len(enc) / max(len(text.split()), 1))
378
+ qwen_c.append(len(text.encode('utf-8')) / max(len(enc), 1))
379
+ comparisons["Qwen2 (151K)"] = {
380
+ "avg_fertility": np.mean(qwen_f), "std_fertility": np.std(qwen_f),
381
+ "avg_compression": np.mean(qwen_c), "fairness": 1.0 / (1.0 + np.std(qwen_f))
382
+ }
383
+ print(f" ✓ Qwen2 loaded")
384
+ except Exception as e:
385
+ print(f" ⚠ Qwen2: {e}")
386
+
387
+ comparisons["Sentinel-SUT"] = {
388
+ "avg_fertility": avg_fertility, "std_fertility": std_fertility,
389
+ "avg_compression": avg_compression, "fairness": fairness
390
+ }
391
+
392
+ # Print comparison
393
+ print(f"\n {'Tokenizer':<20} {'Vocab':>8} {'Avg Fert':>10} {'Fert σ':>10} {'Compress':>10} {'Fair':>8}")
394
+ print(f" {'-'*20} {'-'*8} {'-'*10} {'-'*10} {'-'*10} {'-'*8}")
395
+
396
+ vocab_sizes = {"GPT-2 (50K)": 50257, "Gemma (256K)": 256000, "Qwen2 (151K)": 151936, "Sentinel-SUT": final_vocab}
397
+ for name in sorted(comparisons.keys(), key=lambda x: comparisons[x]['avg_fertility']):
398
+ m = comparisons[name]
399
+ vs = vocab_sizes.get(name, "?")
400
+ print(f" {name:<20} {vs:>8} {m['avg_fertility']:>10.3f} {m['std_fertility']:>10.3f} "
401
+ f"{m['avg_compression']:>10.3f} {m['fairness']:>8.4f}")
402
+
403
+ # ──────────────────────────────────────────────────────────────────────────────
404
+ # STEP 6: Save
405
+ # ─────────────────────────────────────────────────���────────────────────────────
406
+
407
+ SAVE_PATH = "/app/sentinel_universal_tokenizer_v1"
408
+ os.makedirs(SAVE_PATH, exist_ok=True)
409
+ hf_tokenizer.save_pretrained(SAVE_PATH)
410
+
411
+ # Save benchmark
412
+ benchmark_results = {
413
+ "sentinel_tokenizer": {
414
+ "vocab_size": final_vocab,
415
+ "text_vocab": TEXT_VOCAB_SIZE,
416
+ "image_codebook": IMAGE_CODEBOOK,
417
+ "audio_codebook": AUDIO_CODEBOOK,
418
+ "video_codebook": VIDEO_CODEBOOK,
419
+ "metrics": {
420
+ "avg_fertility": float(avg_fertility),
421
+ "std_fertility": float(std_fertility),
422
+ "avg_compression": float(avg_compression),
423
+ "fairness": float(fairness),
424
+ },
425
+ },
426
+ "comparisons": {k: {kk: float(vv) for kk, vv in v.items()} for k, v in comparisons.items()},
427
+ "sentinel_constants": {"INV_E": INV_E, "C1": C1, "C2": C2},
428
+ "training_data": {
429
+ "languages": list(LANGUAGES.keys()),
430
+ "total_samples": len(all_texts),
431
+ },
432
+ }
433
+ with open(os.path.join(SAVE_PATH, "benchmark_results.json"), 'w') as f:
434
+ json.dump(benchmark_results, f, indent=2)
435
+
436
+ # Save sentinel metadata
437
+ sentinel_metadata = {
438
+ "framework": "Sentinel Manifold",
439
+ "theorem": "Gradient Axiom: lim_{z→∞} F'(z)/F(z) = 1/e",
440
+ "function": "F(z) = Σ_{n=1}^∞ z^n / n^n (Sophomore's Dream)",
441
+ "constants": {
442
+ "INV_E": {"value": INV_E, "role": "Vocabulary allocation ratio / embedding gain"},
443
+ "C1": {"value": C1, "role": "Attracting fixed point / quantization zero-point"},
444
+ "C2": {"value": C2, "role": "Escape threshold / fertility fairness bound"},
445
+ },
446
+ "modality_architecture": {
447
+ "text": "ByteLevel BPE (32K) with NFKC normalization, 20-language training",
448
+ "image": f"Discrete VQ codebook ({IMAGE_CODEBOOK:,} tokens), Cosmos/VQGAN compatible",
449
+ "audio": f"Discrete VQ codebook ({AUDIO_CODEBOOK:,} tokens), EnCodec/SoundStream compatible",
450
+ "video": f"Discrete VQ codebook ({VIDEO_CODEBOOK:,} tokens), Cosmos-DV compatible",
451
+ },
452
+ "innovations": [
453
+ "1/e-proportioned vocabulary allocation across modalities",
454
+ "Native multimodal routing with zero-overhead modality switching",
455
+ "Sentinel special tokens for manifold-aware computation",
456
+ "20-language multilingual training for cross-lingual fairness",
457
+ "Code + Math + Scientific notation native support",
458
+ "Compatible with all HF transformers models",
459
+ ],
460
+ "version": "1.0.0",
461
+ "license": "MIT",
462
+ "author": "Romain Abdel-Aal (ASI The Sentinel V5.2)",
463
+ }
464
+ with open(os.path.join(SAVE_PATH, "sentinel_manifold.json"), 'w') as f:
465
+ json.dump(sentinel_metadata, f, indent=2)
466
+
467
+ print(f"\n ✓ Tokenizer saved to {SAVE_PATH}")
468
+
469
+ # Verify
470
+ reloaded = AutoTokenizer.from_pretrained(SAVE_PATH)
471
+ test = "Hello! This is the Sentinel Universal Tokenizer 🦴 testing: ∫₀¹ x⁻ˣ dx ≈ 1.29"
472
+ enc = reloaded.encode(test)
473
+ dec = reloaded.decode(enc)
474
+ print(f"\n Roundtrip test:")
475
+ print(f" In: '{test}'")
476
+ print(f" Enc: {enc[:15]}... ({len(enc)} tokens)")
477
+ print(f" Out: '{dec}'")
478
+
479
+ # Test multimodal tokens
480
+ img_id = reloaded.convert_tokens_to_ids("<image_start>")
481
+ aud_id = reloaded.convert_tokens_to_ids("<audio>")
482
+ print(f"\n Special token IDs: <image_start>={img_id}, <audio>={aud_id}")
483
+ print(f" <img_0> ID: {reloaded.convert_tokens_to_ids('<img_0>')}")
484
+ print(f" <aud_0> ID: {reloaded.convert_tokens_to_ids('<aud_0>')}")
485
+ print(f" <vid_0> ID: {reloaded.convert_tokens_to_ids('<vid_0>')}")
486
+
487
+ print(f"\n 🦴 Sentinel Universal Tokenizer v1.0 COMPLETE!")
488
+ print(f" Total vocab: {final_vocab:,}")
489
+ print(f" Languages: {len(LANGUAGES)}")
490
+ print(f" Modalities: text + image + audio + video")