Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /self_dialogue_forge.py

bbkdevops

about 1 month ago

download

raw

8.49 kB

	"""Self-dialogue dataset forge for TinyMind.

	The records are generated from deterministic rules with disjoint train/eval
	seeds. Each target contains plan/act/verify/final fields so the model learns
	to produce an interaction trace instead of memorising a bare answer string.
	"""

	from __future__ import annotations

	from collections import Counter
	from dataclasses import asdict, dataclass
	import hashlib
	import json
	import random
	from pathlib import Path
	from typing import Iterable


	# Schema tag stamped on every JSONL row and recorded in the manifest.
	SCHEMA_VERSION = "tinymind-self-dialogue-v1"


	@dataclass(frozen=True)
	class SelfDialogueRecord:
	    """One immutable self-dialogue example.

	    The four trace fields (``plan``, ``action``, ``verification``, ``final``)
	    are stored separately; :attr:`target` renders them into the tagged text
	    used as the training target.
	    """

	    # Dataset split label, e.g. "train" or "eval".
	    split: str
	    # Language code of the natural-language text, e.g. "th" or "en".
	    lang: str
	    # Coarse task family, e.g. "symbolic_math", "logic", "self_correction".
	    domain: str
	    # Name of the deterministic rule that produced the record.
	    rule_id: str
	    # Input text shown to the model.
	    prompt: str
	    # Trace fields, rendered in this order by ``target``.
	    plan: str
	    action: str
	    verification: str
	    final: str
	    # Expected answer as a bare string.
	    oracle: str
	    # Seed the record was generated from.
	    source_seed: int
	    quality_score: float = 1.0

	    @property
	    def target(self) -> str:
	        """Return the trace as four tagged sections separated by newlines.

	        The order is fixed: ``<plan>``, ``<act>``, ``<verify>``, ``<final>``.
	        There is no trailing newline.
	        """
	        sections = (
	            f"<plan>{self.plan}</plan>",
	            f"<act>{self.action}</act>",
	            f"<verify>{self.verification}</verify>",
	            f"<final>{self.final}</final>",
	        )
	        return "\n".join(sections)


	def _record_id(record: SelfDialogueRecord) -> str:
	    """Return a 24-hex-character content id for ``record``.

	    The id is the first 24 hex digits of the SHA-256 of the record's fields,
	    serialised as JSON with sorted keys and non-ASCII text left unescaped,
	    then encoded as UTF-8. Every dataclass field takes part in the hash, so
	    two records that differ only in ``split`` or ``source_seed`` get
	    different ids.
	    """
	    # sort_keys makes the serialisation independent of field declaration order.
	    canonical = json.dumps(asdict(record), ensure_ascii=False, sort_keys=True)
	    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
	    return digest[:24]


	def _make_arithmetic(seed: int, split: str, lang: str) -> SelfDialogueRecord:
	    """Build one ``(a + b) * c`` record whose operands are derived from ``seed``.

	    Args:
	        seed: Seeds a private ``random.Random``. It is also stored as
	            ``source_seed`` and, reduced modulo 1_000_003, shown as the
	            ``case N`` prefix of the prompt.
	        split: Split label copied onto the record.
	        lang: ``"th"`` selects the Thai templates; any other value falls
	            back to the English ones.

	    Returns:
	        A ``symbolic_math`` / ``add_then_multiply`` record whose ``oracle``
	        is the result as a decimal string.
	    """
	    rng = random.Random(seed)
	    case_id = seed % 1_000_003
	    # The draw order a, b, c fixes the values for a given seed; keep it.
	    a = rng.randint(11, 89)
	    b = rng.randint(7, 61)
	    c = rng.randint(2, 9)
	    subtotal = a + b
	    answer = subtotal * c
	    # The worked computation is language-neutral, so it is built once.
	    action = f"{a}+{b}={subtotal}; {subtotal}*{c}={answer}"
	    if lang == "th":
	        prompt = f"case {case_id}: คิดเองทีละขั้น: ({a} + {b}) * {c} ได้เท่าไร"
	        plan = "แยกโจทย์เป็นบวกก่อนแล้วคูณ ตรวจด้วยการคำนวณย้อนกลับ"
	        verification = f"ตรวจย้อนกลับ {answer}/{c}={subtotal} และ {subtotal}-{b}={a}"
	        final = f"คำตอบคือ {answer}"
	    else:
	        prompt = f"case {case_id}: Self-solve step by step: ({a} + {b}) * {c}"
	        plan = "Add first, multiply second, then verify by reversing the operation."
	        verification = f"Reverse check: {answer}/{c}={subtotal} and {subtotal}-{b}={a}"
	        final = f"The answer is {answer}"
	    return SelfDialogueRecord(
	        split,
	        lang,
	        "symbolic_math",
	        "add_then_multiply",
	        prompt,
	        plan,
	        action,
	        verification,
	        final,
	        str(answer),
	        seed,
	    )


	def _make_boolean(seed: int, split: str, lang: str) -> SelfDialogueRecord:
	    """Build one record that evaluates ``(A and not B) or C`` for seeded bits.

	    Args:
	        seed: Seeds a private ``random.Random``. It is also stored as
	            ``source_seed`` and, reduced modulo 1_000_003, shown as the
	            ``case N`` prefix of the prompt.
	        split: Split label copied onto the record.
	        lang: ``"th"`` selects the Thai templates; any other value falls
	            back to the English ones.

	    Returns:
	        A ``logic`` / ``boolean_trace`` record whose ``oracle`` is ``"0"``
	        or ``"1"``.
	    """
	    rng = random.Random(seed)
	    case_id = seed % 1_000_003
	    # The draw order A, B, C fixes the values for a given seed; keep it.
	    a = bool(rng.randint(0, 1))
	    b = bool(rng.randint(0, 1))
	    c = bool(rng.randint(0, 1))
	    not_b = not b
	    a_and_not_b = a and not_b
	    answer = a_and_not_b or c
	    bits = f"A={int(a)}, B={int(b)}, C={int(c)}"
	    # The step-by-step trace is language-neutral, so it is built once.
	    action = f"not B={int(not_b)}; A and not B={int(a_and_not_b)}; result={int(answer)}"
	    if lang == "th":
	        prompt = f"case {case_id}: ตรวจตรรกะด้วยตัวเอง: ({bits}) ค่า (A and not B) or C คืออะไร"
	        plan = "หาค่า not B ก่อน จากนั้นทำ and แล้ว or กับ C"
	        verification = "แทนค่าซ้ำในนิพจน์เดิมแล้วได้ผลเดียวกัน"
	        final = f"ผลลัพธ์คือ {int(answer)}"
	    else:
	        prompt = f"case {case_id}: Self-check the logic: ({bits}) evaluate (A and not B) or C"
	        plan = "Compute not B, combine with A, then OR the result with C."
	        verification = "Substituting the values back into the expression gives the same result."
	        final = f"The result is {int(answer)}"
	    return SelfDialogueRecord(
	        split,
	        lang,
	        "logic",
	        "boolean_trace",
	        prompt,
	        plan,
	        action,
	        verification,
	        final,
	        str(int(answer)),
	        seed,
	    )


	def _make_repair(seed: int, split: str, lang: str) -> SelfDialogueRecord:
	    """Build one record that asks the model to repair a wrong draft of ``n*2``.

	    Args:
	        seed: Seeds a private ``random.Random``. It is also stored as
	            ``source_seed`` and, reduced modulo 1_000_003, shown as the
	            ``case N`` prefix of the prompt.
	        split: Split label copied onto the record.
	        lang: ``"th"`` selects the Thai templates; any other value falls
	            back to the English ones.

	    Returns:
	        A ``self_correction`` / ``repair_wrong_draft`` record whose
	        ``oracle`` is ``n * 2`` as a decimal string.
	    """
	    rng = random.Random(seed)
	    case_id = seed % 1_000_003
	    n = rng.randint(18, 80)
	    # NOTE(review): the draft is offset from ``n``, not from ``n * 2``, so it
	    # lands near the operand rather than near the true product. Left as-is so
	    # generated data stays unchanged; confirm whether that was intended.
	    # With n >= 18 and offsets <= 6 the draft can never equal n * 2.
	    wrong = n + rng.choice([-5, -3, 4, 6])
	    answer = n * 2
	    # The recomputation trace is language-neutral, so it is built once.
	    action = f"{n}*2={answer}; draft={wrong}; mismatch={int(answer != wrong)}"
	    if lang == "th":
	        prompt = f"case {case_id}: มีคำตอบร่างว่า {n}*2={wrong} ให้ตรวจเองแล้วแก้"
	        plan = "อย่าเชื่อร่างคำตอบ ให้คูณใหม่และเทียบกับคำตอบร่าง"
	        verification = "คำนวณซ้ำพบว่าร่างเดิมไม่ตรง จึงต้องแก้เป็นค่าที่คำนวณใหม่"
	        final = f"แก้คำตอบเป็น {answer}"
	    else:
	        prompt = f"case {case_id}: A draft says {n}*2={wrong}. Self-check and repair it."
	        plan = "Do not trust the draft; recompute and compare against it."
	        verification = "Recomputation disagrees with the draft, so the final answer must be repaired."
	        final = f"Repair the answer to {answer}"
	    return SelfDialogueRecord(
	        split,
	        lang,
	        "self_correction",
	        "repair_wrong_draft",
	        prompt,
	        plan,
	        action,
	        verification,
	        final,
	        str(answer),
	        seed,
	    )


	# Generator functions cycled through by index when a split is built; each
	# takes (seed, split, lang) and returns a SelfDialogueRecord.
	GENERATORS = (_make_arithmetic, _make_boolean, _make_repair)


	class SelfDialogueForge:
	    """Generate auditable self-dialogue train/eval data.

	    Rows are produced by cycling through the module-level ``GENERATORS`` with
	    seeds derived from a single base seed, so the same constructor arguments
	    always yield the same rows.
	    """

	    def __init__(self, train_size: int = 48, eval_size: int = 12, seed: int = 20260523):
	        """Store the split sizes and the base seed, coercing each to ``int``."""
	        self.train_size = int(train_size)
	        self.eval_size = int(eval_size)
	        self.seed = int(seed)

	    def build(self) -> tuple[list[SelfDialogueRecord], list[SelfDialogueRecord]]:
	        """Return ``(train_rows, eval_rows)`` without touching the filesystem.

	        Eval seeds start 1_000_000 above the train base and both splits step
	        by 37. Since 1_000_000 is not a multiple of 37, the two seed
	        sequences never share a value, whatever the split sizes.
	        """
	        train = self._build_split("train", self.train_size, self.seed)
	        eval_rows = self._build_split("eval", self.eval_size, self.seed + 1_000_000)
	        return train, eval_rows

	    def _build_split(self, split: str, size: int, base_seed: int) -> list[SelfDialogueRecord]:
	        """Generate ``size`` rows for ``split``; row ``i`` uses seed ``base_seed + 37 * i``.

	        Generators are used round-robin. Roughly one row in three is Thai and
	        the rest are English.
	        """
	        generator_count = len(GENERATORS)
	        rows: list[SelfDialogueRecord] = []
	        for i in range(size):
	            # Language is keyed on the pass number through GENERATORS, not on
	            # ``i`` itself: with three generators, ``i % 3`` coincides with the
	            # generator index, which would pair Thai with the first generator
	            # only and leave the other generators' Thai templates unused.
	            lang = "th" if (i // generator_count) % 3 == 0 else "en"
	            generator = GENERATORS[i % generator_count]
	            rows.append(generator(base_seed + i * 37, split, lang))
	        return rows

	    def write_jsonl(self, out_dir: str | Path) -> dict:
	        """Write both splits and a manifest into ``out_dir`` and return the manifest.

	        Args:
	            out_dir: Target directory; created (with parents) if missing.

	        Returns:
	            The manifest dict that is also written to
	            ``self_dialogue_manifest.json``: record counts, per-domain and
	            per-rule counts, the purity policy tags, and the SHA-256 of each
	            JSONL file as written to disk.
	        """
	        out = Path(out_dir)
	        out.mkdir(parents=True, exist_ok=True)
	        train, eval_rows = self.build()
	        train_path = out / "self_dialogue_train.jsonl"
	        eval_path = out / "self_dialogue_eval.jsonl"
	        self._write_rows(train_path, train)
	        self._write_rows(eval_path, eval_rows)

	        # NOTE: record ids hash every field, including ``split`` and
	        # ``source_seed``, so this overlap is a check on id collisions rather
	        # than on prompt-level duplication between the splits.
	        train_ids = {_record_id(row) for row in train}
	        eval_ids = {_record_id(row) for row in eval_rows}
	        all_rows = train + eval_rows
	        manifest = {
	            "schema_version": SCHEMA_VERSION,
	            "train_path": str(train_path),
	            "eval_path": str(eval_path),
	            "train_records": len(train),
	            "eval_records": len(eval_rows),
	            "train_eval_id_overlap": len(train_ids & eval_ids),
	            "domain_counts": dict(Counter(row.domain for row in all_rows)),
	            "rule_counts": dict(Counter(row.rule_id for row in all_rows)),
	            "purity_policy": [
	                "deterministic_oracle_generated",
	                "train_eval_seed_disjoint",
	                "contains_plan_act_verify_final",
	                "no_external_answer_corpus",
	            ],
	            # Hash the bytes on disk so the digests match what a consumer reads.
	            "sha256": {
	                "train": hashlib.sha256(train_path.read_bytes()).hexdigest(),
	                "eval": hashlib.sha256(eval_path.read_bytes()).hexdigest(),
	            },
	        }
	        manifest_path = out / "self_dialogue_manifest.json"
	        manifest_path.write_text(
	            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
	            encoding="utf-8",
	        )
	        return manifest

	    @staticmethod
	    def _write_rows(path: Path, rows: Iterable[SelfDialogueRecord]) -> None:
	        """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one record per line.

	        Each line holds the record's fields plus ``id``, ``schema_version`` and
	        the rendered ``target``. ``newline="\\n"`` keeps line endings as LF on
	        every platform so the file hash does not depend on the OS.
	        """
	        with path.open("w", encoding="utf-8", newline="\n") as f:
	            for row in rows:
	                payload = asdict(row)
	                payload["id"] = _record_id(row)
	                payload["schema_version"] = SCHEMA_VERSION
	                payload["target"] = row.target
	                f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")

Xet Storage Details

Size: 8.49 kB
Xet hash: 1dcc04481d9dded536a32f3fb85424d9301677f6556c8fc9862b13f027d6ae08

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.