bbkdevops's picture
download
raw
5.65 kB
from __future__ import annotations
from datetime import datetime, timezone
import json
import math
from pathlib import Path
from typing import Any
import torch
from model.architecture import OmegaModel
from model.config import OmegaConfig
def _render(row: dict[str, Any]) -> str:
    """Flatten a chat-style record into a newline-joined ``ROLE: content`` transcript.

    Args:
        row: A record whose optional ``"messages"`` entry is a list of
            mappings with optional ``"role"`` and ``"content"`` keys.

    Returns:
        One ``"ROLE: content"`` line per message with non-empty content,
        joined by ``"\\n"``. The role is upper-cased and defaults to
        ``"USER"``; content is whitespace-stripped. Returns ``""`` when no
        message has content.
    """
    parts = []
    # ``or []`` also covers an explicit JSON null, which ``.get(key, [])``
    # would pass straight through and fail to iterate.
    for message in row.get("messages") or []:
        role = message.get("role")
        if role is None:
            # A null role is treated like a missing one.
            role = "user"
        content = message.get("content")
        # str(None) would yield the literal text "None" and be emitted as a
        # real turn; a null content is an empty message instead.
        text = "" if content is None else str(content).strip()
        if text:
            parts.append(f"{str(role).upper()}: {text}")
    return "\n".join(parts)
def _load_rows(path: str | Path, limit_records: int | None = None) -> list[dict[str, Any]]:
    """Read records from a JSON Lines file, skipping blank lines.

    Args:
        path: Location of the JSONL file.
        limit_records: Maximum number of records to return. ``None`` reads
            the whole file; zero or a negative value returns no records.

    Returns:
        The parsed records in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows: list[dict[str, Any]] = []
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise make the first line unparsable.
    with Path(path).open("r", encoding="utf-8-sig") as handle:
        for line in handle:
            # Check the limit before consuming a record so that a limit of
            # zero (or less) yields an empty list instead of one record.
            if limit_records is not None and len(rows) >= limit_records:
                break
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows
def _encode(text: str, seq_len: int, vocab_size: int) -> torch.Tensor:
    """Turn *text* into a fixed-length 1-D tensor of byte-derived token ids.

    The sequence is id ``1``, then one id per UTF-8 byte of *text* (each
    mapped to ``4 + byte % max(1, vocab_size - 4)``), then id ``2``. It is
    cut to ``seq_len`` entries and right-padded with ``0`` when shorter.

    Args:
        text: String to encode.
        seq_len: Length of the returned sequence.
        vocab_size: Vocabulary size used to wrap byte values.

    Returns:
        A ``torch.long`` tensor holding the token ids.
    """
    # Byte-level deterministic tokenizer: compact, self-contained, and enough
    # for smoke proof without depending on HF tokenizers.
    modulus = max(1, vocab_size - 4)
    raw = text.encode("utf-8", errors="replace")
    tokens = [1, *(4 + value % modulus for value in raw), 2][:seq_len]
    # A non-positive repeat count yields an empty list, so no padding is
    # added once the sequence already fills seq_len.
    tokens += [0] * (seq_len - len(tokens))
    return torch.tensor(tokens, dtype=torch.long)
def _batch(rows: list[dict[str, Any]], seq_len: int, vocab_size: int) -> torch.Tensor:
    """Render and encode every row, then stack the results along a new first axis.

    Args:
        rows: Records to convert; each is passed through ``_render`` and
            then ``_encode``.
        seq_len: Sequence length forwarded to ``_encode``.
        vocab_size: Vocabulary size forwarded to ``_encode``.

    Returns:
        The per-row tensors stacked with ``torch.stack(..., dim=0)``.
    """
    encoded = []
    for row in rows:
        transcript = _render(row)
        encoded.append(_encode(transcript, seq_len, vocab_size))
    return torch.stack(encoded, dim=0)
def _config(dim: int, layers: int, seq_len: int, vocab_size: int) -> OmegaConfig:
    """Assemble the ``OmegaConfig`` used by the micro-train run.

    Args:
        dim: Model width; also drives the head count and ``memory_ranks``.
        layers: Number of layers; also sets ``residual_alpha`` and
            ``self_assessment_frequency``.
        seq_len: Maximum sequence length; also caps ``local_window`` at 32.
        vocab_size: Vocabulary size.

    Returns:
        An ``OmegaConfig`` built from the derived and fixed settings below.
    """
    # One head per 32 channels of width, clamped to the range [1, 8].
    n_heads = max(1, min(8, dim // 32))
    settings = dict(
        vocab_size=vocab_size,
        dim=dim,
        n_layers=layers,
        n_heads=n_heads,
        head_dim=dim // n_heads,
        ffn_mult=2,
        layer_pattern="P",
        memory_ranks=max(8, dim // 8),
        local_window=min(32, seq_len),
        timescale_count=4,
        low_rank=4,
        residual_alpha=layers ** -0.5,
        max_seq_len=seq_len,
        dropout=0.0,
        architecture_mode="purefield",
        cnn_core_enabled=True,
        self_assessment_enabled=True,
        self_assessment_frequency=max(1, layers),
        self_assessment_steps=2,
        regen_kv_enabled=False,
    )
    return OmegaConfig(**settings)
def _eval_loss(model: OmegaModel, ids: torch.Tensor) -> float:
    """Return the model's loss on *ids* as a Python float, without tracking gradients.

    The model is switched to evaluation mode and left there; callers that
    continue training must call ``model.train()`` again themselves.

    Args:
        model: Callable model whose output mapping contains a ``"loss"`` entry.
        ids: Token ids, passed as both the input and the labels.

    Returns:
        The ``"loss"`` value detached, moved to CPU, and converted to ``float``.
    """
    model.eval()
    with torch.no_grad():
        result = model(ids, labels=ids)
        loss_tensor = result["loss"]
        return float(loss_tensor.detach().cpu())
def run_native_micro_train(
    out_dir: str | Path,
    *,
    dataset: str | Path,
    max_steps: int = 8,
    eval_records: int = 16,
    limit_records: int | None = None,
    dim: int = 128,
    layers: int = 3,
    seq_len: int = 128,
    vocab_size: int = 512,
    learning_rate: float = 3e-4,
    seed: int = 20260527,
) -> dict[str, Any]:
    """Run a short training loop on a JSONL dataset and write a JSON report.

    The leading records are held out for evaluation and the rest are used
    for training, one record per optimizer step, cycling through them when
    there are more steps than records. Evaluation loss is measured before
    and after training.

    Args:
        out_dir: Directory for the report; created if it does not exist.
        dataset: Path to the JSONL dataset.
        max_steps: Requested number of optimizer steps (at least one is run).
        eval_records: Upper bound on the number of held-out records.
        limit_records: Optional cap on records read from *dataset*.
        dim: Model width passed to ``_config``.
        layers: Layer count passed to ``_config``.
        seq_len: Sequence length used for encoding and the model config.
        vocab_size: Vocabulary size used for encoding and the model config.
        learning_rate: AdamW learning rate.
        seed: Seed given to ``torch.manual_seed`` before the model is built.

    Returns:
        The report dictionary, which is also written to
        ``native_micro_train_report.json`` inside *out_dir*.

    Raises:
        ValueError: If fewer than four records are loaded.
    """
    torch.manual_seed(seed)
    report_dir = Path(out_dir)
    report_dir.mkdir(parents=True, exist_ok=True)

    records = _load_rows(dataset, limit_records=limit_records)
    if len(records) < 4:
        raise ValueError("native micro-train requires at least 4 rows")

    # Hold out the leading records: at most a third of the data, capped by
    # eval_records, and never fewer than one.
    holdout = max(1, min(eval_records, len(records) // 3))
    eval_rows, train_rows = records[:holdout], records[holdout:]

    model = OmegaModel(_config(dim, layers, seq_len, vocab_size))
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)
    eval_ids = _batch(eval_rows, seq_len, vocab_size)
    train_ids = _batch(train_rows, seq_len, vocab_size)

    pre_loss = _eval_loss(model, eval_ids)

    # _eval_loss leaves the model in eval mode, so switch back for training.
    model.train()
    steps = max(1, int(max_steps))
    train_count = train_ids.shape[0]
    losses: list[float] = []
    for step in range(steps):
        # Batch of one record, cycling through the training rows.
        index = step % train_count
        sample = train_ids[index : index + 1]
        optimizer.zero_grad(set_to_none=True)
        loss = model(sample, labels=sample)["loss"]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        losses.append(float(loss.detach().cpu()))

    post_loss = _eval_loss(model, eval_ids)
    pre_finite = math.isfinite(pre_loss)
    post_finite = math.isfinite(post_loss)

    report: dict[str, Any] = {
        "schema": "tinymind.native_micro_train.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "dataset": str(dataset),
    }
    report["summary"] = {
        "records_loaded": len(records),
        "train_records": len(train_rows),
        "eval_records": len(eval_rows),
        "dim": dim,
        "layers": layers,
        "seq_len": seq_len,
        "vocab_size": vocab_size,
    }
    report["metrics"] = {
        "pre_eval_loss": pre_loss,
        "post_eval_loss": post_loss,
        "pre_eval_loss_finite": pre_finite,
        "post_eval_loss_finite": post_finite,
        "train_loss_first": losses[0],
        "train_loss_last": losses[-1],
        "train_steps_completed": len(losses),
        "post_minus_pre_eval_loss": post_loss - pre_loss,
    }
    report["claim_gate"] = {
        "native_training_proven": pre_finite and post_finite and len(losses) == steps,
        "tier0_claim_allowed": False,
        "world_best_claim_allowed": False,
        "reason": "This is a micro-train smoke proof for TinyMind-native learning, not broad capability evidence.",
    }

    report_path = report_dir / "native_micro_train_report.json"
    report["json_path"] = str(report_path)
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    report_path.write_text(serialized + "\n", encoding="utf-8")
    return report

Xet Storage Details

Size:
5.65 kB
·
Xet hash:
cc8b8487046e7661d425f042da21a1a81d0bee869bb891d50320e26bc2596507

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.