Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /train /evo_continue.py

bbkdevops

about 1 month ago

download

raw

6.35 kB

	from __future__ import annotations

	from dataclasses import dataclass
	from datetime import datetime, timezone
	import json
	from pathlib import Path
	from typing import Any


	# Directory one level above the folder that contains this file; it is the base
	# for the default paths declared on EvoContinuePlan.
	ROOT = Path(__file__).resolve().parents[1]


	@dataclass(frozen=True)
	class EvoContinuePlan:
	    """Immutable description of one queued continuation-training stage.

	    ``write_evo_continue_plan`` turns an instance into a PowerShell script plus
	    a JSON plan report. The first four fields are required; the rest default to
	    locations under ``ROOT``.
	    """

	    # PID the generated script polls with Get-Process before it starts training;
	    # the wait loop is skipped when the value is not greater than zero.
	    wait_pid: int
	    # Adapter directory passed as --resume-adapter; the script requires an
	    # adapter_config.json inside it.
	    base_adapter: Path
	    # Dataset path passed as --dataset; the script requires it to exist.
	    dataset: Path
	    # Destination passed as --output.
	    output_adapter: Path

	    # Values forwarded as --max-steps and --max-seq-length.
	    max_steps: int = 400
	    max_seq_length: int = 2048

	    # Parent directory of the timestamped run directory (log + run manifest).
	    run_root: Path = ROOT.joinpath("reports", "qlora_runs")
	    # Interpreter and training script invoked by the generated PowerShell script.
	    python_exe: Path = ROOT.joinpath(".venv312_cuda", "Scripts", "python.exe")
	    train_script: Path = ROOT.joinpath("model", "tinymind-12b", "train_12b_qlora.py")
	    # JSON manifest summarized into the report's "data_purity" section when present.
	    data_manifest: Path = ROOT.joinpath(
	        "reports",
	        "dataset_quality_governor",
	        "dataset_quality_governor_manifest.json",
	    )


	def _read_json(path: Path) -> dict[str, Any]:
	    """Decode the UTF-8 JSON document stored at *path* and return the parsed value."""
	    with path.open("r", encoding="utf-8") as handle:
	        return json.load(handle)


	def _purity_summary(manifest: dict[str, Any]) -> dict[str, Any]:
	    """Condense a dataset-governor manifest into the counts reported in the plan.

	    Args:
	        manifest: Parsed manifest mapping. The keys read are ``kept_records``,
	            ``rejected_records``, ``reject_counts`` and ``domain_counts``; each
	            may be missing or explicitly ``null`` and is then treated as
	            zero / empty.

	    Returns:
	        A dict with the total, kept and rejected record counts, the rejection
	        rate rounded to six decimals (``0.0`` when there are no records), the
	        number of rejections in the hard-noise categories, and the per-domain
	        and per-reason count mappings.
	    """
	    # `.get(key, default)` only covers a *missing* key; a JSON null would reach
	    # int()/.get() as None and crash, so fall back with `or` instead.
	    kept = int(manifest.get("kept_records") or 0)
	    rejected = int(manifest.get("rejected_records") or 0)
	    total = kept + rejected
	    reject_counts = manifest.get("reject_counts") or {}

	    hard_noise_reasons = ("secret_like_token", "encoded_blob", "repetition_loop", "symbol_noise")
	    hard_noise = sum(int(reject_counts.get(reason) or 0) for reason in hard_noise_reasons)

	    return {
	        "input_records": total,
	        "kept_records": kept,
	        "rejected_records": rejected,
	        # Guard the division so an empty manifest reports 0.0 instead of raising.
	        "rejection_rate": round(rejected / total, 6) if total else 0.0,
	        "hard_noise_rejected": hard_noise,
	        "domain_counts": manifest.get("domain_counts") or {},
	        "reject_counts": reject_counts,
	    }


	# PowerShell's tokenizer accepts the typographic quotes U+2018, U+2019, U+201A
	# and U+201B as single-quote delimiters in addition to the ASCII apostrophe, so
	# every one of them has to be doubled to stay inside a '...' literal.
	_PS_SINGLE_QUOTE_TABLE = str.maketrans(
	    {quote: quote * 2 for quote in "'\u2018\u2019\u201a\u201b"}
	)


	def _ps_escape(value: str | Path) -> str:
	    """Return *value* as text that is safe inside a PowerShell single-quoted string.

	    Args:
	        value: A string or path; it is converted with ``str()`` first.

	    Returns:
	        The text with every single-quote character doubled. The surrounding
	        quotes are not added; the caller supplies them.
	    """
	    return str(value).translate(_PS_SINGLE_QUOTE_TABLE)


	def write_evo_continue_plan(plan: EvoContinuePlan, out_dir: str | Path) -> dict[str, Any]:
	    """Write the queued-continuation PowerShell script and its JSON plan report.

	    Two files are created in *out_dir* (the directory is created if needed):

	    * ``run_evo_after_current.ps1`` - waits for ``plan.wait_pid`` to exit,
	      checks that the base adapter and dataset exist, runs the training script
	      with all output redirected to a log, and keeps a run manifest updated
	      with the current status and exit code.
	    * ``evo_continue_plan.json`` - the report returned by this function,
	      without the ``plan_path`` key.

	    Nothing is launched here; the function only writes the two files.

	    Args:
	        plan: Paths and training limits for the stage.
	        out_dir: Directory that receives the script and the plan report.

	    Returns:
	        The report dict, with ``plan_path`` added after the JSON file is written.

	    Raises:
	        TypeError, ValueError: If ``wait_pid``, ``max_steps`` or
	            ``max_seq_length`` cannot be converted to ``int``.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    data_manifest = _read_json(plan.data_manifest) if plan.data_manifest.exists() else {}

	    # One clock read, so the run-directory stamp and "created_at" always agree.
	    now = datetime.now(timezone.utc)
	    stamp = now.strftime("%Y%m%d_%H%M%S")
	    run_dir = plan.run_root / f"tinymind_12b_evo_continue_{stamp}"
	    log_path = run_dir / "train.log"
	    run_manifest = run_dir / "evo_run_manifest.json"
	    script_path = out / "run_evo_after_current.ps1"
	    plan_path = out / "evo_continue_plan.json"

	    # These values are spliced into the script unquoted; forcing them to int
	    # keeps arbitrary text out of the generated PowerShell.
	    wait_pid = int(plan.wait_pid)
	    max_steps = int(plan.max_steps)
	    max_seq_length = int(plan.max_seq_length)

	    # Literal braces of PowerShell blocks are doubled because this is an f-string.
	    script = f"""param()

	$ErrorActionPreference = "Stop"
	$waitPid = {wait_pid}
	$baseAdapter = '{_ps_escape(plan.base_adapter)}'
	$dataset = '{_ps_escape(plan.dataset)}'
	$output = '{_ps_escape(plan.output_adapter)}'
	$runDir = '{_ps_escape(run_dir)}'
	$log = '{_ps_escape(log_path)}'
	$manifestPath = '{_ps_escape(run_manifest)}'

	New-Item -ItemType Directory -Force -Path $runDir | Out-Null
	$manifest = [ordered]@{{
	    schema_version = "tinymind-evo-continue-run-v1"
	    created_at = (Get-Date).ToString("o")
	    wait_pid = $waitPid
	    base_adapter = $baseAdapter
	    dataset = $dataset
	    output_adapter = $output
	    max_steps = {max_steps}
	    max_seq_length = {max_seq_length}
	    log = $log
	    status = "waiting_for_prior_evo_stage"
	    claim_gate = [ordered]@{{
	        training_started = $false
	        training_completed = $false
	        external_eval_completed = $false
	        world_best_claim_allowed = $false
	    }}
	}}
	$manifest | ConvertTo-Json -Depth 8 | Set-Content -Path $manifestPath -Encoding UTF8

	while ($waitPid -gt 0 -and (Get-Process -Id $waitPid -ErrorAction SilentlyContinue)) {{
	    Start-Sleep -Seconds 60
	}}

	if (-not (Test-Path -LiteralPath (Join-Path $baseAdapter "adapter_config.json"))) {{
	    $manifest.status = "blocked_missing_base_adapter"
	    $manifest.finished_at = (Get-Date).ToString("o")
	    $manifest | ConvertTo-Json -Depth 8 | Set-Content -Path $manifestPath -Encoding UTF8
	    throw "Base adapter is not ready: $baseAdapter"
	}}
	if (-not (Test-Path -LiteralPath $dataset)) {{
	    $manifest.status = "blocked_missing_dataset"
	    $manifest.finished_at = (Get-Date).ToString("o")
	    $manifest | ConvertTo-Json -Depth 8 | Set-Content -Path $manifestPath -Encoding UTF8
	    throw "Dataset is not ready: $dataset"
	}}

	$manifest.status = "training_started"
	$manifest.started_at = (Get-Date).ToString("o")
	$manifest.claim_gate.training_started = $true
	$manifest | ConvertTo-Json -Depth 8 | Set-Content -Path $manifestPath -Encoding UTF8

	# Windows PowerShell 5.1 wraps redirected native stderr in error records; under
	# "Stop" the first stderr line would abort this script before the exit code is
	# recorded, so the preference is relaxed for the training call only.
	$ErrorActionPreference = "Continue"
	& '{_ps_escape(plan.python_exe)}' '{_ps_escape(plan.train_script)}' `
	    --dataset $dataset `
	    --resume-adapter $baseAdapter `
	    --output $output `
	    --max-steps {max_steps} `
	    --max-seq-length {max_seq_length} *> $log
	$exitCode = $LASTEXITCODE
	$ErrorActionPreference = "Stop"

	$manifest.exit_code = $exitCode
	$manifest.finished_at = (Get-Date).ToString("o")
	if ($exitCode -eq 0) {{
	    $manifest.status = "training_completed"
	    $manifest.claim_gate.training_completed = $true
	}} else {{
	    $manifest.status = "training_failed"
	}}
	$manifest | ConvertTo-Json -Depth 8 | Set-Content -Path $manifestPath -Encoding UTF8
	exit $exitCode
	"""
	    # newline="\n" stops Python from translating line endings on Windows.
	    # NOTE(review): the file is written without a BOM; Windows PowerShell 5.1 is
	    # understood to read BOM-less scripts in the ANSI code page, so non-ASCII
	    # paths may need "utf-8-sig" - confirm against the hosts that run this.
	    script_path.write_text(script, encoding="utf-8", newline="\n")

	    report = {
	        "schema_version": "tinymind-evo-continue-plan-v1",
	        "created_at": now.isoformat(),
	        "script_path": str(script_path),
	        "planned_run_dir": str(run_dir),
	        "planned_run_manifest": str(run_manifest),
	        "planned_log": str(log_path),
	        "wait_pid": plan.wait_pid,
	        "base_adapter": str(plan.base_adapter),
	        "dataset": str(plan.dataset),
	        "output_adapter": str(plan.output_adapter),
	        "max_steps": plan.max_steps,
	        "max_seq_length": plan.max_seq_length,
	        "data_purity": _purity_summary(data_manifest),
	        "evo_policy": {
	            "optimize_for": [
	                "lower eval loss on clean governed data",
	                "lower repetition and secret-leak risk",
	                "continued adapter improvement without overwriting prior evidence",
	            ],
	            "stop_rule": "Do not claim frontier/world-best until raw external gates and saved evals pass.",
	            "resource_rule": "Use queued 3090 QLoRA stages; do not interrupt active training.",
	        },
	        "claim_gate": {
	            # Writing the plan does not start or queue anything by itself.
	            "evo_training_queued": False,
	            "external_rank1_claim_allowed": False,
	            "reason": "This plan can continue training, but capability claims require completed train/eval and external results.",
	        },
	    }
	    plan_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    # Added after serialization, so the file on disk does not contain its own path.
	    report["plan_path"] = str(plan_path)
	    return report

Xet Storage Details

Size: 6.35 kB
Xet hash: bcdbbaa70edb606347ced7e9de8278edf3226f5daba869d726eb3990738e0e87

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.