bbkdevops's picture
download
raw
6.35 kB
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Any
ROOT = Path(__file__).resolve().parents[1]
@dataclass(frozen=True)
class EvoContinuePlan:
    """Immutable inputs for one queued "evo continue" QLoRA training stage.

    The first four fields are required; the remaining fields default to
    fixed locations underneath the repository ``ROOT``.
    """

    # PID the generated script polls before starting; values <= 0 skip the wait.
    wait_pid: int
    # Adapter directory handed to --resume-adapter; must contain adapter_config.json.
    base_adapter: Path
    # Training data path handed to --dataset.
    dataset: Path
    # Destination handed to --output.
    output_adapter: Path
    # Forwarded to the training script as --max-steps.
    max_steps: int = 400
    # Forwarded to the training script as --max-seq-length.
    max_seq_length: int = 2048
    # Parent directory under which the timestamped run directory is planned.
    run_root: Path = ROOT.joinpath("reports", "qlora_runs")
    # Interpreter used to launch the training script.
    python_exe: Path = ROOT.joinpath(".venv312_cuda", "Scripts", "python.exe")
    # Training entry point executed by the generated script.
    train_script: Path = ROOT.joinpath("model", "tinymind-12b", "train_12b_qlora.py")
    # Manifest summarised into the plan's "data_purity" section when it exists.
    data_manifest: Path = ROOT.joinpath(
        "reports",
        "dataset_quality_governor",
        "dataset_quality_governor_manifest.json",
    )
def _read_json(path: Path) -> dict[str, Any]:
    """Return the parsed content of the UTF-8 encoded JSON file at *path*."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def _purity_summary(manifest: dict[str, Any]) -> dict[str, Any]:
    """Condense a dataset-governor manifest into record and rejection totals.

    Missing keys count as zero (or as empty mappings). ``rejection_rate`` is
    the rejected share of kept + rejected records rounded to six decimals,
    and is ``0.0`` when there are no records at all.
    """
    kept_count = int(manifest.get("kept_records", 0))
    rejected_count = int(manifest.get("rejected_records", 0))
    record_total = kept_count + rejected_count
    rejects_by_reason = manifest.get("reject_counts", {})

    # Only these rejection reasons are tallied as "hard noise".
    hard_noise_total = 0
    for reason in ("secret_like_token", "encoded_blob", "repetition_loop", "symbol_noise"):
        hard_noise_total += int(rejects_by_reason.get(reason, 0))

    if record_total:
        rate = round(rejected_count / record_total, 6)
    else:
        rate = 0.0

    summary: dict[str, Any] = {}
    summary["input_records"] = record_total
    summary["kept_records"] = kept_count
    summary["rejected_records"] = rejected_count
    summary["rejection_rate"] = rate
    summary["hard_noise_rejected"] = hard_noise_total
    summary["domain_counts"] = manifest.get("domain_counts", {})
    summary["reject_counts"] = rejects_by_reason
    return summary
# PowerShell accepts the ASCII apostrophe and four Unicode "smart" quotes
# (U+2018, U+2019, U+201A, U+201B) as single-quote delimiters, so each of them
# has to be doubled to stay literal inside a single-quoted string.
_PS_SINGLE_QUOTE_DOUBLING = str.maketrans(
    {quote: quote * 2 for quote in "'\u2018\u2019\u201a\u201b"}
)


def _ps_escape(value: str | Path) -> str:
    """Escape *value* for use inside a single-quoted PowerShell string literal.

    Args:
        value: Text or path to embed; it is converted with ``str()`` first.

    Returns:
        The text with every character PowerShell treats as a single quote
        doubled. The caller supplies the surrounding quotes.
    """
    return str(value).translate(_PS_SINGLE_QUOTE_DOUBLING)
def write_evo_continue_plan(plan: EvoContinuePlan, out_dir: str | Path) -> dict[str, Any]:
    """Write the queued-training PowerShell script and its JSON plan report.

    Two files are created inside ``out_dir`` (which is created when missing):

    * ``run_evo_after_current.ps1`` -- when executed it waits for
      ``plan.wait_pid`` to exit, verifies that the base adapter and dataset
      exist, launches the training script and records each state change in
      ``evo_run_manifest.json`` inside the planned run directory.
    * ``evo_continue_plan.json`` -- the report returned by this function,
      serialised before the ``plan_path`` key is added.

    Nothing is executed here; the run directory is only created when the
    generated script runs.

    Args:
        plan: Paths and training limits to embed in the script and report.
        out_dir: Directory that receives the script and the plan report.

    Returns:
        The report dictionary, extended with ``plan_path``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    # A missing governor manifest is tolerated; the purity summary then reports zeros.
    data_manifest = _read_json(plan.data_manifest) if plan.data_manifest.exists() else {}

    # Read the clock once so the run-directory stamp and ``created_at`` agree.
    now = datetime.now(timezone.utc)
    stamp = now.strftime("%Y%m%d_%H%M%S")
    run_dir = plan.run_root / f"tinymind_12b_evo_continue_{stamp}"
    log_path = run_dir / "train.log"
    run_manifest = run_dir / "evo_run_manifest.json"
    script_path = out / "run_evo_after_current.ps1"
    plan_path = out / "evo_continue_plan.json"

    # Literal PowerShell braces are doubled because this is an f-string.
    script = f"""param()
$ErrorActionPreference = "Stop"
$waitPid = {plan.wait_pid}
$baseAdapter = '{_ps_escape(plan.base_adapter)}'
$dataset = '{_ps_escape(plan.dataset)}'
$output = '{_ps_escape(plan.output_adapter)}'
$runDir = '{_ps_escape(run_dir)}'
$log = '{_ps_escape(log_path)}'
$manifestPath = '{_ps_escape(run_manifest)}'
New-Item -ItemType Directory -Force -Path $runDir | Out-Null
$manifest = [ordered]@{{
schema_version = "tinymind-evo-continue-run-v1"
created_at = (Get-Date).ToString("o")
wait_pid = $waitPid
base_adapter = $baseAdapter
dataset = $dataset
output_adapter = $output
max_steps = {plan.max_steps}
max_seq_length = {plan.max_seq_length}
log = $log
status = "waiting_for_prior_evo_stage"
claim_gate = [ordered]@{{
training_started = $false
training_completed = $false
external_eval_completed = $false
world_best_claim_allowed = $false
}}
}}
$manifest | ConvertTo-Json -Depth 8 | Set-Content -LiteralPath $manifestPath -Encoding UTF8
while ($waitPid -gt 0 -and (Get-Process -Id $waitPid -ErrorAction SilentlyContinue)) {{
Start-Sleep -Seconds 60
}}
if (-not (Test-Path -LiteralPath (Join-Path $baseAdapter "adapter_config.json"))) {{
$manifest.status = "blocked_missing_base_adapter"
$manifest.finished_at = (Get-Date).ToString("o")
$manifest | ConvertTo-Json -Depth 8 | Set-Content -LiteralPath $manifestPath -Encoding UTF8
throw "Base adapter is not ready: $baseAdapter"
}}
if (-not (Test-Path -LiteralPath $dataset)) {{
$manifest.status = "blocked_missing_dataset"
$manifest.finished_at = (Get-Date).ToString("o")
$manifest | ConvertTo-Json -Depth 8 | Set-Content -LiteralPath $manifestPath -Encoding UTF8
throw "Dataset is not ready: $dataset"
}}
$manifest.status = "training_started"
$manifest.started_at = (Get-Date).ToString("o")
$manifest.claim_gate.training_started = $true
$manifest | ConvertTo-Json -Depth 8 | Set-Content -LiteralPath $manifestPath -Encoding UTF8
# Redirected stderr of a native command becomes a terminating error under "Stop"
# on Windows PowerShell and PowerShell before 7.2, so relax it for the launch.
$global:LASTEXITCODE = $null
$ErrorActionPreference = "Continue"
& '{_ps_escape(plan.python_exe)}' '{_ps_escape(plan.train_script)}' `
--dataset $dataset `
--resume-adapter $baseAdapter `
--output $output `
--max-steps {plan.max_steps} `
--max-seq-length {plan.max_seq_length} *> $log
$exitCode = $LASTEXITCODE
$ErrorActionPreference = "Stop"
# No exit code means the interpreter never started; record that as a failure.
if ($null -eq $exitCode) {{
$exitCode = 1
}}
$manifest.exit_code = $exitCode
$manifest.finished_at = (Get-Date).ToString("o")
if ($exitCode -eq 0) {{
$manifest.status = "training_completed"
$manifest.claim_gate.training_completed = $true
}} else {{
$manifest.status = "training_failed"
}}
$manifest | ConvertTo-Json -Depth 8 | Set-Content -LiteralPath $manifestPath -Encoding UTF8
exit $exitCode
"""
    # Windows PowerShell decodes BOM-less scripts with the ANSI code page, which
    # would corrupt non-ASCII characters in the embedded paths; the BOM prevents that.
    script_path.write_text(script, encoding="utf-8-sig", newline="\n")

    report = {
        "schema_version": "tinymind-evo-continue-plan-v1",
        "created_at": now.isoformat(),
        "script_path": str(script_path),
        "planned_run_dir": str(run_dir),
        "planned_run_manifest": str(run_manifest),
        "planned_log": str(log_path),
        "wait_pid": plan.wait_pid,
        "base_adapter": str(plan.base_adapter),
        "dataset": str(plan.dataset),
        "output_adapter": str(plan.output_adapter),
        "max_steps": plan.max_steps,
        "max_seq_length": plan.max_seq_length,
        "data_purity": _purity_summary(data_manifest),
        "evo_policy": {
            "optimize_for": [
                "lower eval loss on clean governed data",
                "lower repetition and secret-leak risk",
                "continued adapter improvement without overwriting prior evidence",
            ],
            "stop_rule": "Do not claim frontier/world-best until raw external gates and saved evals pass.",
            "resource_rule": "Use queued 3090 QLoRA stages; do not interrupt active training.",
        },
        "claim_gate": {
            "evo_training_queued": False,
            "external_rank1_claim_allowed": False,
            "reason": "This plan can continue training, but capability claims require completed train/eval and external results.",
        },
    }
    plan_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    # Added after serialisation, so the file on disk does not contain its own path.
    report["plan_path"] = str(plan_path)
    return report

Xet Storage Details

Size:
6.35 kB
·
Xet hash:
bcdbbaa70edb606347ced7e9de8278edf3226f5daba869d726eb3990738e0e87

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.