Upload training_code/train.py with huggingface_hub

d42a1f3 verified 21 days ago

10.1 kB

	"""
	Distributed training script for 1B parameter Transformer.

	Launch: torchrun --nproc_per_node=8 train.py

	Stack: PyTorch DDP + BF16 autocast + 8x H100 80GB
	"""

	import os
	import sys
	import math
	import time
	import json
	import datetime

	import torch
	import torch.distributed as dist
	from torch.nn.parallel import DistributedDataParallel as DDP

	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
	from model.config import ModelConfig, TrainConfig
	from model.transformer import Transformer
	from model.data import get_tokenizer, create_dataloader


	def get_wsd_lr(step, warmup_steps, total_steps, max_lr, min_lr, stable_fraction=0.8):
	    """Return the Warmup-Stable-Decay learning rate for ``step``.

	    The schedule has three phases:

	    1. Linear warmup from 0 to ``max_lr`` over the first ``warmup_steps`` steps.
	    2. Constant ``max_lr`` until ``int(total_steps * stable_fraction)``.
	    3. Cosine decay from ``max_lr`` to ``min_lr`` over the remaining steps.

	    Args:
	        step: Zero-based optimizer step the rate is requested for.
	        warmup_steps: Number of linear warmup steps (0 disables warmup).
	        total_steps: Total number of optimizer steps in the run.
	        max_lr: Peak learning rate, used during the stable phase.
	        min_lr: Learning rate reached at the end of the decay phase.
	        stable_fraction: Fraction of ``total_steps`` after which decay starts.
	            The default of 0.8 decays over the last 20% of the run.

	    Returns:
	        The learning rate as a float.
	    """
	    stable_end = int(total_steps * stable_fraction)
	    if step < warmup_steps:
	        return max_lr * step / max(warmup_steps, 1)
	    if step < stable_end:
	        return max_lr

	    decay_steps = max(total_steps - stable_end, 1)
	    # Clamp so steps at or beyond total_steps stay at min_lr; without this the
	    # cosine keeps cycling and the rate climbs back towards max_lr.
	    progress = min((step - stable_end) / decay_steps, 1.0)
	    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))


	def find_latest_checkpoint(checkpoint_dir):
	    """Find the highest-numbered ``step_<N>.pt`` checkpoint in a directory.

	    Only files whose name is exactly ``step_`` + decimal digits + ``.pt`` are
	    considered; other files matching ``step_*.pt`` (for example
	    ``step_best.pt``) are ignored instead of raising ``ValueError``.

	    Args:
	        checkpoint_dir: Directory to search (not searched recursively).

	    Returns:
	        A ``(path, step)`` tuple for the checkpoint with the largest step
	        number, or ``(None, 0)`` when no matching checkpoint exists.
	    """
	    import glob
	    import re

	    step_name = re.compile(r"step_(\d+)\.pt")
	    # Escape the directory so glob metacharacters in its path are taken literally.
	    pattern = os.path.join(glob.escape(checkpoint_dir), "step_*.pt")

	    latest_path, latest_step = None, 0
	    for path in glob.glob(pattern):
	        match = step_name.fullmatch(os.path.basename(path))
	        if match is None:
	            continue
	        step = int(match.group(1))
	        if latest_path is None or step > latest_step:
	            latest_path, latest_step = path, step
	    return latest_path, latest_step


	def _atomic_torch_save(state, path):
	    """Serialize ``state`` with ``torch.save`` via a temp file renamed onto ``path``.

	    Writing to ``path + ".tmp"`` first means an interrupted save never leaves a
	    truncated file under the final name; ``os.replace`` swaps it in afterwards.
	    The temp name does not match the ``step_*.pt`` pattern used when resuming.
	    """
	    tmp_path = path + ".tmp"
	    torch.save(state, tmp_path)
	    os.replace(tmp_path, path)


	def main():
	    """Run the distributed training job on the current rank.

	    Steps, in order:

	    1. Initialise the NCCL process group and bind this rank to its CUDA device
	       (``RANK`` / ``LOCAL_RANK`` / ``WORLD_SIZE`` are read from the environment).
	    2. Build ``ModelConfig`` / ``TrainConfig``, the tokenizer, the model wrapped
	       in DDP, and a fused AdamW optimizer with weight decay applied only to
	       parameters with two or more dimensions.
	    3. Resume model and optimizer state from the latest ``step_*.pt`` checkpoint
	       in ``train_config.checkpoint_dir`` if one exists.
	    4. Train with gradient accumulation under BF16 autocast, using the WSD
	       learning-rate schedule, gradient clipping, periodic logging (stdout and
	       ``train_log.jsonl`` on rank 0) and periodic checkpoints.
	    5. Save ``final.pt`` on rank 0 and tear down the process group.
	    """
	    # Local import: only this function needs it (nullcontext for the log file
	    # on non-zero ranks and for the final, synchronising micro-batch).
	    import contextlib

	    dist.init_process_group("nccl", timeout=datetime.timedelta(minutes=30))
	    rank = int(os.environ.get("RANK", 0))
	    local_rank = int(os.environ.get("LOCAL_RANK", 0))
	    world_size = int(os.environ.get("WORLD_SIZE", 1))
	    torch.cuda.set_device(local_rank)
	    device = torch.device(f"cuda:{local_rank}")

	    model_config = ModelConfig()
	    train_config = TrainConfig()
	    accum_steps = train_config.gradient_accumulation_steps

	    eff_batch = train_config.batch_size_per_gpu * world_size * accum_steps
	    tokens_per_step = eff_batch * model_config.max_seq_len
	    total_steps = train_config.total_tokens // tokens_per_step

	    if rank == 0:
	        os.makedirs(train_config.log_dir, exist_ok=True)
	        os.makedirs(train_config.checkpoint_dir, exist_ok=True)
	        print("=" * 70)
	        print(" TRAINING 1B TRANSFORMER FROM SCRATCH")
	        print(f" Arch: {model_config.num_layers}L / {model_config.hidden_dim}D / "
	              f"{model_config.num_attention_heads}H / GQA-{model_config.num_kv_heads}KV / "
	              f"SwiGLU-{model_config.intermediate_dim}")
	        print(f" Seq: {model_config.max_seq_len} | Vocab: {model_config.vocab_size}")
	        print(f" GPUs: {world_size}x H100 80GB | Backend: DDP + BF16 autocast")
	        print(f" Batch: {eff_batch} seqs = {tokens_per_step:,} tok/step")
	        print(f" Steps: {total_steps:,} | Target: {train_config.total_tokens:,} tokens")
	        print("=" * 70)

	    # Tokenizer
	    tokenizer = get_tokenizer()

	    # Model — seed before construction so the initial weights depend only on the seed
	    torch.manual_seed(train_config.seed)
	    model = Transformer(model_config).to(device)

	    if rank == 0:
	        n_params = sum(p.numel() for p in model.parameters())
	        print(f"[Init] Params: {n_params:,} ({n_params/1e9:.3f}B)")

	    model = DDP(model, device_ids=[local_rank])

	    # Optimizer — weight decay only on tensors with >= 2 dims; 0-d/1-d
	    # parameters go into the no-decay group.
	    trainable = [p for p in model.parameters() if p.requires_grad]
	    decay_params = [p for p in trainable if p.dim() >= 2]
	    nodecay_params = [p for p in trainable if p.dim() < 2]
	    optimizer = torch.optim.AdamW(
	        [
	            {"params": decay_params, "weight_decay": train_config.weight_decay},
	            {"params": nodecay_params, "weight_decay": 0.0},
	        ],
	        lr=train_config.learning_rate,
	        betas=(train_config.beta1, train_config.beta2),
	        fused=True,
	    )

	    if rank == 0:
	        dp = sum(p.numel() for p in decay_params)
	        ndp = sum(p.numel() for p in nodecay_params)
	        print(f"[Init] Optimizer: {dp:,} decay + {ndp:,} no-decay params")

	    # Resume from checkpoint
	    resume_step = 0
	    ckpt_path, ckpt_step = find_latest_checkpoint(train_config.checkpoint_dir)
	    if ckpt_path is not None:
	        if rank == 0:
	            print(f"[Resume] Loading checkpoint: {ckpt_path} (step {ckpt_step})")
	        # NOTE(security): weights_only=False unpickles arbitrary objects. Only
	        # load checkpoints written by this script, never untrusted files.
	        ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
	        model.module.load_state_dict(ckpt["model"])
	        optimizer.load_state_dict(ckpt["optimizer"])
	        resume_step = ckpt["step"]
	        if rank == 0:
	            print(f"[Resume] Restored model + optimizer at step {resume_step}, "
	                  f"loss was {ckpt.get('loss', 'N/A')}")
	        # Drop the loaded copy before training allocates activations.
	        del ckpt
	        torch.cuda.empty_cache()
	    elif rank == 0:
	        print("[Init] No checkpoint found, starting from scratch")

	    # Data — use (seed + resume_step) so resumed runs see different shuffled data
	    effective_seed = train_config.seed + resume_step
	    dataloader = create_dataloader(tokenizer, train_config, rank=rank, world_size=world_size,
	                                   seed_override=effective_seed)
	    data_iter = iter(dataloader)

	    if rank == 0:
	        print("[Init] Dataloader ready (streaming FineWeb-Edu 10BT)")
	        print(f"[Schedule] WSD: warmup {train_config.warmup_steps} -> "
	              f"stable {int(total_steps*0.8)} -> decay {total_steps}")
	        if resume_step > 0:
	            remaining = total_steps - resume_step
	            print(f"[Resume] Continuing from step {resume_step}, {remaining:,} steps remaining")
	        print("-" * 70)
	        sys.stdout.flush()

	    # ===== TRAINING LOOP =====
	    model.train()
	    global_step = resume_step
	    running_loss = 0.0
	    window_steps = 0  # optimizer steps accumulated since the last log line
	    best_loss = float("inf")
	    tokens_at_start = resume_step * tokens_per_step
	    tokens_done = tokens_at_start
	    t0 = time.time()
	    step_t0 = time.time()

	    # Only rank 0 writes the JSONL log; other ranks get None. The with-block
	    # guarantees the file is closed even if training raises.
	    if rank == 0:
	        log_ctx = open(os.path.join(train_config.log_dir, "train_log.jsonl"), "a")
	    else:
	        log_ctx = contextlib.nullcontext()

	    with log_ctx as log_file:
	        while global_step < total_steps:
	            optimizer.zero_grad(set_to_none=True)
	            micro_loss = 0.0

	            for micro in range(accum_steps):
	                try:
	                    input_ids, labels = next(data_iter)
	                except StopIteration:
	                    # Dataloader exhausted: start another pass over it.
	                    data_iter = iter(dataloader)
	                    input_ids, labels = next(data_iter)

	                input_ids = input_ids.to(device, non_blocking=True)
	                labels = labels.to(device, non_blocking=True)

	                # Skip DDP's gradient all-reduce on all but the last micro-batch;
	                # gradients accumulate locally and are synchronised once per
	                # optimizer step. The forward pass must sit inside no_sync() too.
	                is_last_micro = micro == accum_steps - 1
	                sync_ctx = contextlib.nullcontext() if is_last_micro else model.no_sync()
	                with sync_ctx:
	                    # BF16 autocast — no scaler needed (BF16 has enough dynamic range)
	                    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	                        _, loss = model(input_ids, labels)
	                        # Scale so the accumulated gradient is the mean over micro-batches.
	                        loss = loss / accum_steps

	                    loss.backward()
	                micro_loss += loss.item()

	            # Gradient clipping
	            torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.grad_clip)

	            # LR schedule — set on every param group before stepping
	            lr = get_wsd_lr(global_step, train_config.warmup_steps, total_steps,
	                            train_config.learning_rate, train_config.min_lr)
	            for pg in optimizer.param_groups:
	                pg["lr"] = lr

	            optimizer.step()
	            global_step += 1
	            running_loss += micro_loss
	            window_steps += 1
	            tokens_done += tokens_per_step

	            # Log
	            logged_this_step = global_step % train_config.log_interval == 0
	            if logged_this_step:
	                dt = time.time() - step_t0
	                # Use the real window length: after a resume the first window
	                # may contain fewer than log_interval steps.
	                tps = (window_steps * tokens_per_step) / max(dt, 1e-9)
	                avg = running_loss / max(window_steps, 1)
	                elapsed = time.time() - t0
	                pct = 100.0 * global_step / total_steps
	                # Elapsed time only covers this process, so divide by the steps
	                # done since (re)start, not by the absolute step count.
	                steps_this_run = max(global_step - resume_step, 1)
	                eta = (elapsed / steps_this_run) * (total_steps - global_step)

	                if rank == 0:
	                    gpu_mem = torch.cuda.max_memory_allocated(device) / 1e9
	                    print(
	                        f"[Step {global_step:>6d}/{total_steps}] "
	                        f"loss={avg:.4f} | lr={lr:.2e} | "
	                        f"tok/s={tps:,.0f} | GPU={gpu_mem:.1f}GB | "
	                        f"{pct:.1f}% | ETA={eta/3600:.1f}h",
	                        flush=True,
	                    )
	                    if log_file:
	                        log_file.write(json.dumps({
	                            "step": global_step, "loss": round(avg, 4), "lr": lr,
	                            "tps": round(tps), "tokens": tokens_done,
	                            "gpu_gb": round(gpu_mem, 1), "elapsed_s": round(elapsed, 1),
	                        }) + "\n")
	                        log_file.flush()

	                if avg < best_loss:
	                    best_loss = avg
	                running_loss = 0.0
	                window_steps = 0
	                step_t0 = time.time()

	            # Checkpoint
	            if global_step % train_config.save_interval == 0:
	                # Barriers keep all ranks aligned while rank 0 writes the file.
	                dist.barrier()
	                if rank == 0:
	                    ckpt_path = os.path.join(train_config.checkpoint_dir, f"step_{global_step}.pt")
	                    _atomic_torch_save({
	                        "step": global_step,
	                        "model": model.module.state_dict(),
	                        "optimizer": optimizer.state_dict(),
	                        # Windowed average when this step was logged, else the
	                        # loss of the current step only.
	                        "loss": avg if logged_this_step else micro_loss,
	                        "config": {"model": model_config.__dict__, "train": train_config.__dict__},
	                    }, ckpt_path)
	                    print(f" >> Checkpoint: {ckpt_path}", flush=True)
	                dist.barrier()

	        # Final
	        dist.barrier()
	        if rank == 0:
	            final_path = os.path.join(train_config.checkpoint_dir, "final.pt")
	            _atomic_torch_save({
	                "step": global_step,
	                "model": model.module.state_dict(),
	                "config": {"model": model_config.__dict__, "train": train_config.__dict__},
	            }, final_path)
	            total_time = time.time() - t0
	            # Throughput counts only tokens processed by this process, so a
	            # resumed run is not credited with tokens from earlier runs.
	            tokens_this_run = tokens_done - tokens_at_start
	            print("=" * 70)
	            print(" TRAINING COMPLETE")
	            print(f" Steps: {global_step:,} | Tokens: {tokens_done:,}")
	            print(f" Time: {total_time/3600:.2f}h | "
	                  f"Throughput: {tokens_this_run/max(total_time, 1e-9):,.0f} tok/s")
	            print(f" Best loss: {best_loss:.4f}")
	            print(f" Final model: {final_path}")
	            print("=" * 70)

	    dist.destroy_process_group()


	# Script entry point: start training only when this file is executed directly
	# (the module docstring launches it with torchrun), not when it is imported.
	if __name__ == "__main__":
	main()