bbkdevops's picture
download
raw
8.49 kB
"""Self-dialogue dataset forge for TinyMind.
The records are generated from deterministic rules with disjoint train/eval
seeds. Each target contains plan/act/verify/final fields so the model learns
to produce an interaction trace instead of memorising a bare answer string.
"""
from __future__ import annotations
from collections import Counter
from dataclasses import asdict, dataclass
import hashlib
import json
import random
from pathlib import Path
from typing import Iterable
# Schema tag that SelfDialogueForge stamps onto every JSONL row and into the manifest.
SCHEMA_VERSION = "tinymind-self-dialogue-v1"
@dataclass(frozen=True)
class SelfDialogueRecord:
    """One immutable self-dialogue example.

    Field order is part of the contract: the generators in this module
    construct records positionally.
    """

    # Split label and language code handed in by the caller.
    split: str
    lang: str
    # Coarse task family and the specific rule that produced the record.
    domain: str
    rule_id: str
    # Text shown to the model.
    prompt: str
    # The four trace sections that ``target`` renders, in this order.
    plan: str
    action: str
    verification: str
    final: str
    # Bare expected answer as text, without any surrounding sentence.
    oracle: str
    # Seed the record was generated from.
    source_seed: int
    quality_score: float = 1.0

    @property
    def target(self) -> str:
        """Return the trace as four newline-separated tagged sections.

        The sections appear as ``<plan>``, ``<act>``, ``<verify>`` and
        ``<final>``; there is no trailing newline.
        """
        sections = (
            ("plan", self.plan),
            ("act", self.action),
            ("verify", self.verification),
            ("final", self.final),
        )
        return "\n".join(f"<{tag}>{body}</{tag}>" for tag, body in sections)
def _record_id(record: SelfDialogueRecord) -> str:
    """Return a 24-hex-character content id for ``record``.

    The id is the start of the SHA-256 digest of the record's fields
    serialised as key-sorted, non-ASCII-preserving JSON. Every dataclass
    field takes part in the hash, so two records that differ in any field
    (including ``split`` or ``source_seed``) get different ids.
    """
    fields = asdict(record)
    canonical = json.dumps(fields, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()[:24]
def _make_arithmetic(seed: int, split: str, lang: str) -> SelfDialogueRecord:
    """Build one ``(a + b) * c`` record from ``seed``.

    Thai wording is used when ``lang == "th"``; any other value gets the
    English wording. ``lang`` itself is stored on the record unchanged.
    """
    rng = random.Random(seed)
    # The tuple is evaluated left to right, so the draw order is a, b, c.
    a, b, c = rng.randint(11, 89), rng.randint(7, 61), rng.randint(2, 9)
    answer = (a + b) * c
    case_id = seed % 1_000_003

    # The worked steps contain only numbers, so both languages share them.
    action = f"{a}+{b}={a + b}; {a + b}*{c}={answer}"

    if lang == "th":
        prompt, plan, verification, final = (
            f"case {case_id}: คิดเองทีละขั้น: ({a} + {b}) * {c} ได้เท่าไร",
            "แยกโจทย์เป็นบวกก่อนแล้วคูณ ตรวจด้วยการคำนวณย้อนกลับ",
            f"ตรวจย้อนกลับ {answer}/{c}={a + b} และ {a + b}-{b}={a}",
            f"คำตอบคือ {answer}",
        )
    else:
        prompt, plan, verification, final = (
            f"case {case_id}: Self-solve step by step: ({a} + {b}) * {c}",
            "Add first, multiply second, then verify by reversing the operation.",
            f"Reverse check: {answer}/{c}={a + b} and {a + b}-{b}={a}",
            f"The answer is {answer}",
        )

    return SelfDialogueRecord(
        split=split,
        lang=lang,
        domain="symbolic_math",
        rule_id="add_then_multiply",
        prompt=prompt,
        plan=plan,
        action=action,
        verification=verification,
        final=final,
        oracle=str(answer),
        source_seed=seed,
    )
def _make_boolean(seed: int, split: str, lang: str) -> SelfDialogueRecord:
    """Build one ``(A and not B) or C`` evaluation record from ``seed``.

    Thai wording is used when ``lang == "th"``; any other value gets the
    English wording. Truth values are rendered as ``0``/``1`` throughout.
    """
    rng = random.Random(seed)
    # Three consecutive draws, assigned in order to A, B and C.
    a, b, c = (bool(rng.randint(0, 1)) for _ in range(3))
    answer = (a and not b) or c
    case_id = seed % 1_000_003
    bits = f"A={int(a)}, B={int(b)}, C={int(c)}"

    # The evaluation trace is language-neutral, so it is built once.
    action = f"not B={int(not b)}; A and not B={int(a and not b)}; result={int(answer)}"

    if lang == "th":
        prompt, plan, verification, final = (
            f"case {case_id}: ตรวจตรรกะด้วยตัวเอง: ({bits}) ค่า (A and not B) or C คืออะไร",
            "หาค่า not B ก่อน จากนั้นทำ and แล้ว or กับ C",
            "แทนค่าซ้ำในนิพจน์เดิมแล้วได้ผลเดียวกัน",
            f"ผลลัพธ์คือ {int(answer)}",
        )
    else:
        prompt, plan, verification, final = (
            f"case {case_id}: Self-check the logic: ({bits}) evaluate (A and not B) or C",
            "Compute not B, combine with A, then OR the result with C.",
            "Substituting the values back into the expression gives the same result.",
            f"The result is {int(answer)}",
        )

    return SelfDialogueRecord(
        split=split,
        lang=lang,
        domain="logic",
        rule_id="boolean_trace",
        prompt=prompt,
        plan=plan,
        action=action,
        verification=verification,
        final=final,
        oracle=str(int(answer)),
        source_seed=seed,
    )
def _make_repair(seed: int, split: str, lang: str) -> SelfDialogueRecord:
    """Build one "repair the wrong draft" record for ``n * 2`` from ``seed``.

    The prompt shows an incorrect draft result and the trace recomputes the
    product. Thai wording is used when ``lang == "th"``; any other value
    gets the English wording.
    """
    rng = random.Random(seed)
    n = rng.randint(18, 80)
    # n is at least 18 and the largest offset is 6, so the draft can never
    # equal n * 2 and the mismatch flag below is always 1.
    # NOTE(review): the draft is offset from n, not from n * 2, so it is far
    # from the true product — confirm this is intended.
    wrong = n + rng.choice([-5, -3, 4, 6])
    answer = n * 2
    case_id = seed % 1_000_003

    # The recomputation trace is language-neutral, so it is built once.
    action = f"{n}*2={answer}; draft={wrong}; mismatch={int(answer != wrong)}"

    if lang == "th":
        prompt, plan, verification, final = (
            f"case {case_id}: มีคำตอบร่างว่า {n}*2={wrong} ให้ตรวจเองแล้วแก้",
            "อย่าเชื่อร่างคำตอบ ให้คูณใหม่และเทียบกับคำตอบร่าง",
            "คำนวณซ้ำพบว่าร่างเดิมไม่ตรง จึงต้องแก้เป็นค่าที่คำนวณใหม่",
            f"แก้คำตอบเป็น {answer}",
        )
    else:
        prompt, plan, verification, final = (
            f"case {case_id}: A draft says {n}*2={wrong}. Self-check and repair it.",
            "Do not trust the draft; recompute and compare against it.",
            "Recomputation disagrees with the draft, so the final answer must be repaired.",
            f"Repair the answer to {answer}",
        )

    return SelfDialogueRecord(
        split=split,
        lang=lang,
        domain="self_correction",
        rule_id="repair_wrong_draft",
        prompt=prompt,
        plan=plan,
        action=action,
        verification=verification,
        final=final,
        oracle=str(answer),
        source_seed=seed,
    )
# Record builders, all sharing the (seed, split, lang) -> SelfDialogueRecord signature.
# SelfDialogueForge._build_split picks one per row by row index modulo the tuple length.
GENERATORS = (_make_arithmetic, _make_boolean, _make_repair)
class SelfDialogueForge:
    """Generate auditable self-dialogue train/eval data.

    Rows are produced by cycling through the module-level ``GENERATORS``
    with a fixed seed stride per row, so the same ``(train_size, eval_size,
    seed)`` arguments select the same per-row seeds on every run.
    """

    # Distance between the seeds of consecutive rows within a split.
    _SEED_STRIDE = 37
    # Added to ``seed`` for the eval split. 1_000_000 (= 2**6 * 5**6) is not
    # a multiple of the stride 37, so ``seed + 37*i`` can never equal
    # ``seed + 1_000_000 + 37*j``: train and eval seeds are disjoint for
    # any split sizes.
    _EVAL_SEED_OFFSET = 1_000_000
    # One full pass over ``GENERATORS`` in every ``_THAI_PERIOD`` is Thai.
    _THAI_PERIOD = 3

    def __init__(self, train_size: int = 48, eval_size: int = 12, seed: int = 20260523):
        """Store the split sizes and base seed, coerced to ``int``."""
        self.train_size = int(train_size)
        self.eval_size = int(eval_size)
        self.seed = int(seed)

    def build(self) -> tuple[list[SelfDialogueRecord], list[SelfDialogueRecord]]:
        """Return ``(train_rows, eval_rows)`` without touching the filesystem."""
        train = self._build_split("train", self.train_size, self.seed)
        eval_rows = self._build_split(
            "eval", self.eval_size, self.seed + self._EVAL_SEED_OFFSET
        )
        return train, eval_rows

    def _build_split(self, split: str, size: int, base_seed: int) -> list[SelfDialogueRecord]:
        """Generate ``size`` rows for ``split``.

        Row ``i`` uses generator ``i % len(GENERATORS)`` and seed
        ``base_seed + i * 37``. A non-positive ``size`` yields an empty list.
        """
        n_generators = len(GENERATORS)
        rows: list[SelfDialogueRecord] = []
        for i in range(size):
            cycle, slot = divmod(i, n_generators)
            # Choose the language per full pass over the generators rather
            # than per row. Deriving it from ``i % 3`` while there are three
            # generators locks language to generator: Thai rows would always
            # be arithmetic and the other generators would only ever emit
            # English, leaving their Thai branches unreachable.
            lang = "th" if cycle % self._THAI_PERIOD == 0 else "en"
            rows.append(GENERATORS[slot](base_seed + i * self._SEED_STRIDE, split, lang))
        return rows

    def write_jsonl(self, out_dir: str | Path) -> dict:
        """Write both splits plus a manifest into ``out_dir`` and return the manifest.

        Creates ``out_dir`` if needed, then writes
        ``self_dialogue_train.jsonl``, ``self_dialogue_eval.jsonl`` and
        ``self_dialogue_manifest.json``. The manifest records paths, row
        counts, per-domain and per-rule counts, overlap counters and the
        SHA-256 of each JSONL file.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        train, eval_rows = self.build()

        train_path = out / "self_dialogue_train.jsonl"
        eval_path = out / "self_dialogue_eval.jsonl"
        self._write_rows(train_path, train)
        self._write_rows(eval_path, eval_rows)

        all_rows = train + eval_rows
        train_ids = {_record_id(row) for row in train}
        eval_ids = {_record_id(row) for row in eval_rows}
        # Record ids hash every field, including ``split`` and
        # ``source_seed``, so the id overlap cannot be non-zero when the
        # split labels differ. The prompt overlap is the content-level check.
        train_prompts = {row.prompt for row in train}
        eval_prompts = {row.prompt for row in eval_rows}

        manifest = {
            "schema_version": SCHEMA_VERSION,
            "train_path": str(train_path),
            "eval_path": str(eval_path),
            "train_records": len(train),
            "eval_records": len(eval_rows),
            "train_eval_id_overlap": len(train_ids & eval_ids),
            "train_eval_prompt_overlap": len(train_prompts & eval_prompts),
            "domain_counts": dict(Counter(row.domain for row in all_rows)),
            "rule_counts": dict(Counter(row.rule_id for row in all_rows)),
            "purity_policy": [
                "deterministic_oracle_generated",
                "train_eval_seed_disjoint",
                "contains_plan_act_verify_final",
                "no_external_answer_corpus",
            ],
            "sha256": {
                "train": hashlib.sha256(train_path.read_bytes()).hexdigest(),
                "eval": hashlib.sha256(eval_path.read_bytes()).hexdigest(),
            },
        }
        manifest_path = out / "self_dialogue_manifest.json"
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        return manifest

    @staticmethod
    def _write_rows(path: Path, rows: Iterable[SelfDialogueRecord]) -> None:
        """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line.

        Each object holds the record's fields plus ``id``,
        ``schema_version`` and the rendered ``target``. Keys are sorted and
        line endings are forced to ``\\n`` so the file bytes do not depend
        on the platform.
        """
        with path.open("w", encoding="utf-8", newline="\n") as f:
            for row in rows:
                payload = asdict(row)
                payload["id"] = _record_id(row)
                payload["schema_version"] = SCHEMA_VERSION
                payload["target"] = row.target
                f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")

Xet Storage Details

Size:
8.49 kB
·
Xet hash:
1dcc04481d9dded536a32f3fb85424d9301677f6556c8fc9862b13f027d6ae08

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.