Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /self_dialogue_forge.py
"""Self-dialogue dataset forge for TinyMind.

The records are generated from deterministic rules with disjoint train/eval
seeds. Each target contains plan/act/verify/final fields so the model learns
to produce an interaction trace instead of memorising a bare answer string.
"""
from __future__ import annotations

import hashlib
import json
import random
from collections import Counter
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable
# Schema tag stamped onto every JSONL row and recorded in the manifest.
SCHEMA_VERSION = "tinymind-self-dialogue-v1"
@dataclass
class SelfDialogueRecord:
    """One generated self-dialogue example.

    The class must be a dataclass: records are built positionally by the
    generator functions and serialised with ``dataclasses.asdict``.
    """

    # Dataset split label the record was generated for (e.g. "train"/"eval").
    split: str
    # Language code of the prompt and prose fields (e.g. "th" or "en").
    lang: str
    # Coarse task family, e.g. "symbolic_math" or "logic".
    domain: str
    # Identifier of the deterministic rule that produced the record.
    rule_id: str
    # Question text shown to the model.
    prompt: str
    # The four trace fields that make up the training target.
    plan: str
    action: str
    verification: str
    final: str
    # Ground-truth answer rendered as a string.
    oracle: str
    # Seed the generating rule was called with.
    source_seed: int
    quality_score: float = 1.0

    @property
    def target(self) -> str:
        """Return the trace as four tagged lines: plan, act, verify, final.

        Exposed as a property because callers read ``record.target`` as a
        plain attribute when serialising rows.
        """
        return (
            f"<plan>{self.plan}</plan>\n"
            f"<act>{self.action}</act>\n"
            f"<verify>{self.verification}</verify>\n"
            f"<final>{self.final}</final>"
        )
def _record_id(record: SelfDialogueRecord) -> str:
    """Return a 24-hex-character content hash identifying *record*.

    All dataclass fields are serialised to JSON with sorted keys (non-ASCII
    text kept verbatim), encoded as UTF-8 and hashed with SHA-256; the id is
    the first 24 hex characters of that digest.
    """
    canonical = json.dumps(asdict(record), ensure_ascii=False, sort_keys=True)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return digest[:24]
def _make_arithmetic(seed: int, split: str, lang: str) -> SelfDialogueRecord:
    """Build an ``(a + b) * c`` record whose numbers are derived from *seed*.

    Args:
        seed: Seeds the private RNG and is stored as ``source_seed``.
        split: Split label copied onto the record.
        lang: ``"th"`` selects the Thai wording; any other value gets English.

    Returns:
        A record in the ``symbolic_math`` domain with rule id
        ``add_then_multiply``; ``oracle`` is the product as a decimal string.
    """
    rng = random.Random(seed)
    case_id = seed % 1_000_003
    # The draw order a, b, c determines the values produced for a given seed.
    a = rng.randint(11, 89)
    b = rng.randint(7, 61)
    c = rng.randint(2, 9)
    subtotal = a + b
    answer = subtotal * c
    # The worked arithmetic is identical in both languages.
    action = f"{a}+{b}={subtotal}; {subtotal}*{c}={answer}"
    if lang == "th":
        prompt = f"case {case_id}: คิดเองทีละขั้น: ({a} + {b}) * {c} ได้เท่าไร"
        plan = "แยกโจทย์เป็นบวกก่อนแล้วคูณ ตรวจด้วยการคำนวณย้อนกลับ"
        verification = f"ตรวจย้อนกลับ {answer}/{c}={subtotal} และ {subtotal}-{b}={a}"
        final = f"คำตอบคือ {answer}"
    else:
        prompt = f"case {case_id}: Self-solve step by step: ({a} + {b}) * {c}"
        plan = "Add first, multiply second, then verify by reversing the operation."
        verification = f"Reverse check: {answer}/{c}={subtotal} and {subtotal}-{b}={a}"
        final = f"The answer is {answer}"
    return SelfDialogueRecord(
        split=split,
        lang=lang,
        domain="symbolic_math",
        rule_id="add_then_multiply",
        prompt=prompt,
        plan=plan,
        action=action,
        verification=verification,
        final=final,
        oracle=str(answer),
        source_seed=seed,
    )
def _make_boolean(seed: int, split: str, lang: str) -> SelfDialogueRecord:
    """Build an ``(A and not B) or C`` record with seed-derived truth values.

    Args:
        seed: Seeds the private RNG and is stored as ``source_seed``.
        split: Split label copied onto the record.
        lang: ``"th"`` selects the Thai wording; any other value gets English.

    Returns:
        A record in the ``logic`` domain with rule id ``boolean_trace``;
        ``oracle`` is ``"0"`` or ``"1"``.
    """
    rng = random.Random(seed)
    case_id = seed % 1_000_003
    # The draw order a, b, c determines the values produced for a given seed.
    a = bool(rng.randint(0, 1))
    b = bool(rng.randint(0, 1))
    c = bool(rng.randint(0, 1))
    not_b = not b
    a_and_not_b = a and not_b
    answer = a_and_not_b or c
    bits = f"A={int(a)}, B={int(b)}, C={int(c)}"
    # The evaluation trace is identical in both languages.
    action = f"not B={int(not_b)}; A and not B={int(a_and_not_b)}; result={int(answer)}"
    if lang == "th":
        prompt = f"case {case_id}: ตรวจตรรกะด้วยตัวเอง: ({bits}) ค่า (A and not B) or C คืออะไร"
        plan = "หาค่า not B ก่อน จากนั้นทำ and แล้ว or กับ C"
        verification = "แทนค่าซ้ำในนิพจน์เดิมแล้วได้ผลเดียวกัน"
        final = f"ผลลัพธ์คือ {int(answer)}"
    else:
        prompt = f"case {case_id}: Self-check the logic: ({bits}) evaluate (A and not B) or C"
        plan = "Compute not B, combine with A, then OR the result with C."
        verification = "Substituting the values back into the expression gives the same result."
        final = f"The result is {int(answer)}"
    return SelfDialogueRecord(
        split=split,
        lang=lang,
        domain="logic",
        rule_id="boolean_trace",
        prompt=prompt,
        plan=plan,
        action=action,
        verification=verification,
        final=final,
        oracle=str(int(answer)),
        source_seed=seed,
    )
def _make_repair(seed: int, split: str, lang: str) -> SelfDialogueRecord:
    """Build a record that asks the model to fix a wrong ``n*2`` draft.

    Args:
        seed: Seeds the private RNG and is stored as ``source_seed``.
        split: Split label copied onto the record.
        lang: ``"th"`` selects the Thai wording; any other value gets English.

    Returns:
        A record in the ``self_correction`` domain with rule id
        ``repair_wrong_draft``; ``oracle`` is ``n * 2`` as a decimal string.
    """
    rng = random.Random(seed)
    case_id = seed % 1_000_003
    n = rng.randint(18, 80)
    # The draft value is offset from n itself, not from the true product.
    wrong = n + rng.choice([-5, -3, 4, 6])
    answer = n * 2
    # The recomputation trace is identical in both languages.
    action = f"{n}*2={answer}; draft={wrong}; mismatch={int(answer != wrong)}"
    if lang == "th":
        prompt = f"case {case_id}: มีคำตอบร่างว่า {n}*2={wrong} ให้ตรวจเองแล้วแก้"
        plan = "อย่าเชื่อร่างคำตอบ ให้คูณใหม่และเทียบกับคำตอบร่าง"
        verification = "คำนวณซ้ำพบว่าร่างเดิมไม่ตรง จึงต้องแก้เป็นค่าที่คำนวณใหม่"
        final = f"แก้คำตอบเป็น {answer}"
    else:
        prompt = f"case {case_id}: A draft says {n}*2={wrong}. Self-check and repair it."
        plan = "Do not trust the draft; recompute and compare against it."
        verification = "Recomputation disagrees with the draft, so the final answer must be repaired."
        final = f"Repair the answer to {answer}"
    return SelfDialogueRecord(
        split=split,
        lang=lang,
        domain="self_correction",
        rule_id="repair_wrong_draft",
        prompt=prompt,
        plan=plan,
        action=action,
        verification=verification,
        final=final,
        oracle=str(answer),
        source_seed=seed,
    )
# Record factories, selected round-robin by row index when a split is built.
GENERATORS = (_make_arithmetic, _make_boolean, _make_repair)
class SelfDialogueForge:
    """Generate auditable self-dialogue train/eval data."""

    def __init__(self, train_size: int = 48, eval_size: int = 12, seed: int = 20260523):
        """Store the split sizes and base seed, coercing each to ``int``."""
        self.train_size = int(train_size)
        self.eval_size = int(eval_size)
        self.seed = int(seed)

    def build(self) -> tuple[list[SelfDialogueRecord], list[SelfDialogueRecord]]:
        """Return ``(train_rows, eval_rows)`` generated from the base seed.

        The eval split starts 1_000_000 above the train base seed.
        """
        train = self._build_split("train", self.train_size, self.seed)
        eval_rows = self._build_split("eval", self.eval_size, self.seed + 1_000_000)
        return train, eval_rows

    def _build_split(self, split: str, size: int, base_seed: int) -> list[SelfDialogueRecord]:
        """Generate *size* records for *split*; row ``i`` uses seed ``base_seed + i * 37``."""
        rows: list[SelfDialogueRecord] = []
        for i in range(size):
            # NOTE(review): language and generator are both chosen by i % 3
            # while GENERATORS has three entries, so "th" always lands on
            # GENERATORS[0] and the other generators only ever receive "en".
            # Left unchanged so existing dataset checksums stay reproducible.
            lang = "th" if i % 3 == 0 else "en"
            generator = GENERATORS[i % len(GENERATORS)]
            rows.append(generator(base_seed + i * 37, split, lang))
        return rows

    def write_jsonl(self, out_dir: str | Path) -> dict:
        """Write both splits as JSONL plus a manifest into *out_dir*.

        Creates *out_dir* (and parents) if needed, then writes
        ``self_dialogue_train.jsonl``, ``self_dialogue_eval.jsonl`` and
        ``self_dialogue_manifest.json``.

        Returns:
            The manifest dict: file paths, record counts, per-domain and
            per-rule counts, the purity policy tags, SHA-256 digests of the
            two JSONL files, and the number of record ids shared by the
            splits. Record ids hash every field, so that last count only
            detects exact-record collisions.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        train, eval_rows = self.build()
        train_path = out / "self_dialogue_train.jsonl"
        eval_path = out / "self_dialogue_eval.jsonl"
        self._write_rows(train_path, train)
        self._write_rows(eval_path, eval_rows)
        train_ids = {_record_id(row) for row in train}
        eval_ids = {_record_id(row) for row in eval_rows}
        all_rows = train + eval_rows
        manifest = {
            "schema_version": SCHEMA_VERSION,
            "train_path": str(train_path),
            "eval_path": str(eval_path),
            "train_records": len(train),
            "eval_records": len(eval_rows),
            "train_eval_id_overlap": len(train_ids & eval_ids),
            "domain_counts": dict(Counter(row.domain for row in all_rows)),
            "rule_counts": dict(Counter(row.rule_id for row in all_rows)),
            "purity_policy": [
                "deterministic_oracle_generated",
                "train_eval_seed_disjoint",
                "contains_plan_act_verify_final",
                "no_external_answer_corpus",
            ],
            # Digests are taken from the bytes actually written to disk.
            "sha256": {
                "train": hashlib.sha256(train_path.read_bytes()).hexdigest(),
                "eval": hashlib.sha256(eval_path.read_bytes()).hexdigest(),
            },
        }
        manifest_path = out / "self_dialogue_manifest.json"
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        return manifest

    @staticmethod
    def _write_rows(path: Path, rows: Iterable[SelfDialogueRecord]) -> None:
        """Write *rows* to *path* as UTF-8 JSONL, one sorted-key object per line.

        Each line holds the record's fields plus ``id``, ``schema_version``
        and the rendered ``target``. Declared static because it takes no
        ``self`` yet is invoked through the instance.
        """
        # newline="\n" keeps line endings the same on every platform.
        with path.open("w", encoding="utf-8", newline="\n") as f:
            for row in rows:
                payload = asdict(row)
                payload["id"] = _record_id(row)
                payload["schema_version"] = SCHEMA_VERSION
                payload["target"] = row.target
                f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
Xet Storage Details
- Size: 8.49 kB
- Xet hash: 1dcc04481d9dded536a32f3fb85424d9301677f6556c8fc9862b13f027d6ae08
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.