Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/data/hyper_pure_refinery.py

bbkdevops

about 1 month ago

download

raw

17.2 kB

	"""HyperPure Knowledge Refinery for TinyMind.

	This refinery turns small high-trust seed knowledge into dense, auditable,
	training-ready records. Each record carries a claim, evidence, verification,
	failure mode, transfer principle, and purity score so the model learns reusable
	reasoning instead of memorising loose prose.
	"""

	from __future__ import annotations

	from collections import Counter
	from dataclasses import asdict, dataclass
	from datetime import datetime, timezone
	import hashlib
	import json
	from pathlib import Path
	import re
	from typing import Iterable


	# Version tag stamped into every JSONL row ("schema_version") and into the
	# audit manifest written by HyperPureKnowledgeRefinery.
	SCHEMA_VERSION = "tinymind-hyper-pure-refinery-v1"
	# Lowercase substrings that mark text as junk. _junk_score() lowercases the
	# candidate text and adds a 0.8 penalty if ANY marker occurs anywhere in it.
	# Matching is plain substring containment, so a marker such as "todo" also
	# matches when it appears inside a longer word.
	JUNK_MARKERS = (
	"lorem ipsum",
	"todo",
	"fixme",
	"???",
	"as an ai language model",
	"subscribe",
	"click here",
	)


	# Catalogue of knowledge domains that the refinery expands into records.
	# Each key is a domain name; each value is a dict with:
	#   "lang"   - language code. "th" selects the Thai question/answer templates
	#              in HyperPureKnowledgeRefinery._make_record; any other value
	#              selects the English templates.
	#   "seed"   - seed statement for the domain. It is hashed together with the
	#              domain and skill names to produce each record's source_sha256.
	#   "skills" - skill names. build_records() emits `records_per_skill` records
	#              for every skill listed here.
	# The audit's "domain_coverage" score is measured against len(DOMAIN_BLUEPRINTS).
	DOMAIN_BLUEPRINTS = {
	"reasoning_logic": {
	"lang": "en",
	"seed": "Valid reasoning separates claim, premise, inference rule, counterexample, and verification.",
	"skills": ["modus_ponens", "contrapositive", "contradiction_repair", "evidence_policy"],
	},
	"thai_expert_language": {
	"lang": "th",
	"seed": "คำตอบภาษาไทยที่แม่นยำต้องแยกแก่นความหมาย เหตุผล หลักฐาน เงื่อนไข และข้อจำกัดอย่างเป็นธรรมชาติ",
	"skills": ["semantic_precision", "natural_explanation", "formal_to_plain_thai", "ambiguity_control"],
	},
	"english_expert_language": {
	"lang": "en",
	"seed": "Expert English answers preserve scope, modality, causality, and concrete evidence while staying readable.",
	"skills": ["technical_style", "scope_control", "causal_explanation", "calibrated_uncertainty"],
	},
	"polyglot_software": {
	"lang": "en",
	"seed": "Robust software projects use contracts, reproducible builds, typed boundaries, tests, benchmarks, and release evidence.",
	"skills": ["python", "typescript", "rust", "go", "cpp", "java", "swift", "project_scaffold"],
	},
	"tool_sandbox_os": {
	"lang": "en",
	"seed": "Tool-using AI should inspect before act, run sandboxed checks, capture stdout/stderr, and verify file artifacts.",
	"skills": ["powershell", "bash", "cmd", "lua_sandbox", "rust_tooling", "file_manifest"],
	},
	"retrieval_grounding": {
	"lang": "en",
	"seed": "Grounded answers must cite exact chunks, hashes, retrieval terms, and refuse unsupported facts.",
	"skills": ["evidence_ledger", "hash_recall", "semantic_retrieval", "hallucination_refusal"],
	},
	"long_context_memory": {
	"lang": "en",
	"seed": "Ten-million-token recall is guaranteed by exact hashed archives and regenerated context, not by pretending hidden state stores everything.",
	"skills": ["10m_archive", "passkey_recall", "chunk_hash", "bounded_state"],
	},
	"math_science": {
	"lang": "en",
	"seed": "Hard technical answers require definitions, assumptions, derivation, unit checks, and independent validation.",
	"skills": ["algebra", "probability", "physics_units", "scientific_method"],
	},
	"benchmark_claims": {
	"lang": "en",
	"seed": "Model quality claims are valid only when benchmark name, split, date, artifact, and score are saved.",
	"skills": ["mmlu_pro", "ifeval", "arena", "provider_eval", "claim_gate"],
	},
	"self_improvement": {
	"lang": "en",
	"seed": "Self-improving AI grows only through measured error, targeted data, adapter updates, and regression gates.",
	"skills": ["error_mining", "data_refinement", "adapter_growth", "regression_gate"],
	},
	}


	@dataclass(frozen=True)
	class HyperPureRecord:
	    """One immutable claim/evidence/verification (CEV) training record.

	    Instances are frozen, so every field is fixed at construction time.
	    Field order is part of the interface: it determines positional
	    construction and the key order produced by ``dataclasses.asdict``.
	    """

	    # Where the record belongs.
	    domain: str
	    skill: str
	    lang: str

	    # Prompt/response pair.
	    question: str
	    answer: str

	    # CEV payload plus the reusable lesson and its known weak spot.
	    claim: str
	    evidence: str
	    verification: str
	    transfer_principle: str
	    failure_mode: str
	    negative_filter: str

	    # Provenance.
	    source: str
	    source_sha256: str
	    license: str

	    # Scores consulted by the selection and audit gates.
	    quality_score: float
	    purity_score: float
	    rarity_score: float
	    depth_score: float


	def _sha256(text: str) -> str:
	return hashlib.sha256(text.encode("utf-8")).hexdigest()


	def _norm(text: str) -> str:
	return re.sub(r"\s+", " ", text.strip().lower())


	def _junk_score(text: str) -> float:
	    """Score how junk-like *text* is, from 0.0 (clean) to 1.0 (junk).

	    Three independent penalties are summed and the total is capped at 1.0:

	    * 0.8  if any ``JUNK_MARKERS`` entry occurs in the lowercased text;
	    * 0.2  if the text has fewer than 40 word tokens;
	    * 0.35 if fewer than 28% of the word tokens are distinct.

	    A token is a run of word characters or characters in the Thai block
	    (U+0E00-U+0E7F).
	    """
	    lowered = text.lower()
	    tokens = re.findall(r"[\w\u0E00-\u0E7F]+", lowered)

	    penalty = 0.0
	    for marker in JUNK_MARKERS:
	        if marker in lowered:
	            # One hit is enough; further markers do not add more.
	            penalty += 0.8
	            break
	    if len(tokens) < 40:
	        penalty += 0.2
	    # The emptiness guard avoids dividing by zero for token-free text.
	    if tokens and len(set(tokens)) / len(tokens) < 0.28:
	        penalty += 0.35
	    return min(penalty, 1.0)


	def _record_id(record: HyperPureRecord) -> str:
	    """Return a 24-hex-character identifier derived from the record's content.

	    The record is serialised to JSON with sorted keys (non-ASCII text kept
	    as-is), hashed with ``_sha256``, and the digest is truncated to its first
	    24 characters.
	    """
	    canonical = json.dumps(asdict(record), ensure_ascii=False, sort_keys=True)
	    digest = _sha256(canonical)
	    return digest[:24]


	class HyperPureKnowledgeRefinery:
	    """Create dense expert CEV records with strict purity gates.

	    The refinery expands ``DOMAIN_BLUEPRINTS`` into templated
	    claim/evidence/verification (CEV) records, filters and de-duplicates them,
	    splits them per domain into train and eval sets, and writes two JSONL
	    files plus a JSON and a Markdown manifest.
	    """

	    # Names of the gates applied by the pipeline; copied into the manifest.
	    purity_policy = [
	        "claim_evidence_verification_required",
	        "source_hash_required",
	        "failure_mode_required",
	        "transfer_principle_required",
	        "dedupe_by_domain_skill_question",
	        "junk_marker_blocked",
	        "short_or_repetitive_text_blocked",
	    ]

	    def __init__(self, records_per_skill: int = 2, eval_ratio: float = 0.2):
	        """Configure the refinery.

	        Args:
	            records_per_skill: Variants generated per skill; values below 1
	                are raised to 1.
	            eval_ratio: Fraction of each domain's rows held out for
	                evaluation; clamped to the range [0.05, 0.5].
	        """
	        self.records_per_skill = max(1, int(records_per_skill))
	        self.eval_ratio = min(max(float(eval_ratio), 0.05), 0.5)

	    def build_records(self) -> list[HyperPureRecord]:
	        """Generate every record described by ``DOMAIN_BLUEPRINTS``.

	        Returns:
	            ``records_per_skill`` records for each skill of each domain, in
	            blueprint order.
	        """
	        records: list[HyperPureRecord] = []
	        for domain, blueprint in DOMAIN_BLUEPRINTS.items():
	            seed = str(blueprint["seed"])
	            lang = str(blueprint["lang"])
	            for skill in blueprint["skills"]:
	                for i in range(self.records_per_skill):
	                    records.append(self._make_record(domain, skill, lang, seed, i))
	        return records

	    def _make_record(self, domain: str, skill: str, lang: str, seed: str, variant: int) -> HyperPureRecord:
	        """Build one templated record for a domain/skill/variant combination.

	        Args:
	            domain: Domain name from the blueprint catalogue.
	            skill: Skill name within that domain.
	            lang: ``"th"`` selects the Thai templates; anything else English.
	            seed: Domain seed statement; only used for the source hash.
	            variant: Variant index; appears in the question and evidence and
	                nudges the rarity score.

	        Returns:
	            A fully populated, frozen ``HyperPureRecord``.
	        """
	        source_text = f"{domain}:{skill}:{seed}"
	        source_hash = _sha256(source_text)
	        if lang == "th":
	            question = f"สกัดแก่นความรู้ระดับลึกของ {skill} ในโดเมน {domain} แบบตรวจซ้ำได้ [variant {variant}]"
	            answer = (
	                f"แก่นของ {skill} คือการเปลี่ยนความรู้จากข้อความทั่วไปให้เป็นขั้นตอนที่ตรวจได้: "
	                f"เริ่มจากนิยามขอบเขต แยก claim หลัก ระบุ evidence ที่มี hash ตรวจซ้ำได้ "
	                f"อธิบายเหตุผลเป็นลำดับ แล้วปิดท้ายด้วยข้อจำกัดและ failure mode. "
	                f"เมื่อนำไปใช้กับ {domain} โมเดลต้องตอบเป็นภาษาธรรมชาติ แต่ทุกข้อเท็จจริงต้องย้อนกลับไปยังหลักฐานได้"
	            )
	            transfer = "ใช้กับโจทย์ใหม่โดยถามก่อนว่า claim ใดต้องพิสูจน์ หลักฐานอยู่ไหน และอะไรคือเงื่อนไขที่ทำให้คำตอบผิด"
	            failure = "คำตอบจะปนเปื้อนถ้าใช้คำสวยแต่ไม่มี evidence หรือไม่บอกเงื่อนไขที่ทำให้ claim ใช้ไม่ได้"
	            negative = "บล็อกข้อความที่สั้น ซ้ำ ฟุ้ง หรืออ้างความจริงโดยไม่มี hash/source"
	        else:
	            question = f"Extract the deepest reusable knowledge pattern for {skill} in {domain} [variant {variant}]"
	            answer = (
	                f"The reusable core of {skill} is to turn raw knowledge into an auditable procedure: "
	                f"define scope, isolate the claim, attach hash-backed evidence, explain the inference path, "
	                f"state limits, and test the result against a failure case. In {domain}, the model should answer "
	                f"naturally while keeping every factual commitment traceable to evidence rather than style or memorized phrasing."
	            )
	            transfer = "For a new task, identify the claim, locate evidence, choose the smallest valid rule, verify output, then store the checked artifact."
	            failure = "The method fails when the answer sounds fluent but lacks source hashes, boundary conditions, or a reproducible check."
	            negative = "Reject filler, repetition, unsupported rankings, vague authority claims, and records without CEV fields."
	        claim = f"{domain}/{skill} teaches a transferable expert operation, not a fixed answer."
	        evidence = f"hyper_pure_seed_sha256={source_hash}; domain={domain}; skill={skill}; variant={variant}"
	        verification = (
	            "Recompute source_sha256, confirm CEV fields are nonempty, check answer diversity, "
	            "verify transfer_principle and failure_mode exist, then run train/eval split hash."
	        )
	        quality = 0.985
	        # Purity is scored over the same seven fields that select() re-checks.
	        purity = 1.0 - _junk_score("\n".join([question, answer, claim, evidence, verification, transfer, failure]))
	        rarity = 0.90 + min(0.09, 0.01 * variant)
	        depth = 0.96 if len(answer) > 280 else 0.90
	        return HyperPureRecord(
	            domain=domain,
	            skill=skill,
	            lang=lang,
	            question=question,
	            answer=answer,
	            claim=claim,
	            evidence=evidence,
	            verification=verification,
	            transfer_principle=transfer,
	            failure_mode=failure,
	            negative_filter=negative,
	            source="tinymind_hyper_pure_refinery_seed",
	            source_sha256=source_hash,
	            license="internal-clean-synthetic-cev",
	            quality_score=quality,
	            purity_score=purity,
	            rarity_score=rarity,
	            depth_score=depth,
	        )

	    def select(self, records: Iterable[HyperPureRecord]) -> tuple[list[HyperPureRecord], list[dict]]:
	        """Apply the purity gates and de-duplicate the survivors.

	        A record is blocked when its junk score exceeds 0.05, when any of its
	        quality/purity/depth scores is below 0.94, or when claim, evidence,
	        verification or source hash is empty. Surviving records are
	        de-duplicated on domain, skill and normalised question; the first
	        occurrence wins.

	        Returns:
	            ``(kept, blocked)`` where ``blocked`` holds one dict per rejected
	            record with its domain, skill and list of reasons.
	        """
	        kept: dict[str, HyperPureRecord] = {}
	        blocked: list[dict] = []
	        for record in records:
	            text = "\n".join(
	                [
	                    record.question,
	                    record.answer,
	                    record.claim,
	                    record.evidence,
	                    record.verification,
	                    record.transfer_principle,
	                    record.failure_mode,
	                ]
	            )
	            reasons = []
	            if _junk_score(text) > 0.05:
	                reasons.append("junk_score")
	            if min(record.quality_score, record.purity_score, record.depth_score) < 0.94:
	                reasons.append("score_floor")
	            if not all([record.claim, record.evidence, record.verification, record.source_sha256]):
	                reasons.append("missing_cev_or_hash")
	            if reasons:
	                blocked.append({"domain": record.domain, "skill": record.skill, "reasons": reasons})
	                continue
	            key = f"{record.domain}:{record.skill}:{_norm(record.question)}"
	            kept.setdefault(key, record)
	        return list(kept.values()), blocked

	    def write_dataset(self, out_dir: str | Path) -> dict:
	        """Build, filter, split and write the dataset plus its manifests.

	        Creates ``out_dir`` if needed and writes ``hyper_pure_train.jsonl``,
	        ``hyper_pure_eval.jsonl``, ``hyper_pure_manifest.md`` and
	        ``hyper_pure_manifest.json`` into it.

	        Returns:
	            The audit dict, which is also the content of the JSON manifest.
	        """
	        out = Path(out_dir)
	        out.mkdir(parents=True, exist_ok=True)
	        selected, blocked = self.select(self.build_records())
	        selected.sort(key=lambda row: (row.domain, row.skill, row.lang, row.question))
	        train, eval_rows = self._split(selected)
	        train_path = out / "hyper_pure_train.jsonl"
	        eval_path = out / "hyper_pure_eval.jsonl"
	        self._write_jsonl(train_path, train)
	        self._write_jsonl(eval_path, eval_rows)
	        audit = self._audit(selected, blocked, train_path, eval_path)
	        manifest_path = out / "hyper_pure_manifest.json"
	        md_path = out / "hyper_pure_manifest.md"
	        # Record both paths before serialising so the JSON manifest is
	        # written exactly once and is never left without "markdown_path".
	        audit["manifest_path"] = str(manifest_path)
	        audit["markdown_path"] = str(md_path)
	        md_path.write_text(self._markdown(audit), encoding="utf-8")
	        manifest_path.write_text(json.dumps(audit, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	        return audit

	    def _split(self, rows: list[HyperPureRecord]) -> tuple[list[HyperPureRecord], list[HyperPureRecord]]:
	        """Split rows into train and eval sets, domain by domain.

	        For each domain the last ``round(len * eval_ratio)`` rows (at least
	        one) go to eval and the rest to train, so which rows are held out
	        depends on the order of ``rows``. A domain with a single row ends up
	        entirely in eval.

	        Returns:
	            ``(train, eval_rows)``.
	        """
	        by_domain: dict[str, list[HyperPureRecord]] = {}
	        for row in rows:
	            by_domain.setdefault(row.domain, []).append(row)
	        train: list[HyperPureRecord] = []
	        eval_rows: list[HyperPureRecord] = []
	        for domain_rows in by_domain.values():
	            # n_eval is never 0, so the [:-n_eval] slice below is safe.
	            n_eval = max(1, int(round(len(domain_rows) * self.eval_ratio)))
	            eval_rows.extend(domain_rows[-n_eval:])
	            train.extend(domain_rows[:-n_eval])
	        return train, eval_rows

	    def _row(self, record: HyperPureRecord) -> dict:
	        """Convert a record into the dict that is serialised as one JSONL row.

	        Adds ``id``, ``schema_version``, ``created_at`` and a tagged ``text``
	        rendering on top of the record's own fields.
	        """
	        row = asdict(record)
	        row["id"] = _record_id(record)
	        row["schema_version"] = SCHEMA_VERSION
	        # NOTE: a wall-clock timestamp per row means the written files, and
	        # therefore the file hashes in the manifest, differ between runs.
	        row["created_at"] = datetime.now(timezone.utc).isoformat()
	        row["text"] = (
	            f"<domain>{record.domain}</domain>\n"
	            f"<skill>{record.skill}</skill>\n"
	            f"<claim>{record.claim}</claim>\n"
	            f"<evidence>{record.evidence}</evidence>\n"
	            f"<verification>{record.verification}</verification>\n"
	            f"<transfer>{record.transfer_principle}</transfer>\n"
	            f"<failure>{record.failure_mode}</failure>\n"
	            f"<user>{record.question}</user>\n"
	            f"<assistant>{record.answer}</assistant>"
	        )
	        return row

	    def _write_jsonl(self, path: Path, records: list[HyperPureRecord]) -> None:
	        """Write *records* to *path* as UTF-8 JSON Lines with ``\\n`` endings."""
	        with path.open("w", encoding="utf-8", newline="\n") as f:
	            for record in records:
	                f.write(json.dumps(self._row(record), ensure_ascii=False, sort_keys=True) + "\n")

	    @staticmethod
	    def _count_nonblank_lines(path: Path) -> int:
	        """Count the lines of *path* that contain non-whitespace characters.

	        The file is iterated line by line, so only real newlines separate
	        rows. ``str.splitlines`` would also split on characters such as
	        U+2028, which ``json.dumps(..., ensure_ascii=False)`` leaves unescaped
	        inside string values, and would therefore over-count rows.
	        """
	        with path.open("r", encoding="utf-8") as handle:
	            return sum(1 for line in handle if line.strip())

	    def _audit(self, rows: list[HyperPureRecord], blocked: list[dict], train_path: Path, eval_path: Path) -> dict:
	        """Summarise the run and evaluate the release gate.

	        Args:
	            rows: Records that passed selection.
	            blocked: Rejection entries produced by ``select``.
	            train_path: Already-written train JSONL file.
	            eval_path: Already-written eval JSONL file.

	        Returns:
	            A JSON-serialisable dict with counts, mean scores, the gate
	            verdict and the SHA-256 of both files. The gate passes only when
	            there is at least one row, mean purity >= 0.97, mean quality >=
	            0.98, mean depth >= 0.94 and every blueprint domain is present.
	        """
	        domain_counts = Counter(row.domain for row in rows)
	        skill_counts = Counter(row.skill for row in rows)
	        # Guard the means against an empty selection.
	        denominator = max(len(rows), 1)
	        purity = sum(row.purity_score for row in rows) / denominator
	        quality = sum(row.quality_score for row in rows) / denominator
	        depth = sum(row.depth_score for row in rows) / denominator
	        coverage = len(domain_counts) / max(len(DOMAIN_BLUEPRINTS), 1)
	        gate_passed = bool(rows) and purity >= 0.97 and quality >= 0.98 and depth >= 0.94 and coverage == 1.0
	        return {
	            "schema_version": SCHEMA_VERSION,
	            "created_at": datetime.now(timezone.utc).isoformat(),
	            "train_path": str(train_path),
	            "eval_path": str(eval_path),
	            "records_written": len(rows),
	            "train_records": self._count_nonblank_lines(train_path),
	            "eval_records": self._count_nonblank_lines(eval_path),
	            "blocked_records": len(blocked),
	            "blocked": blocked,
	            "domain_counts": dict(domain_counts),
	            "skill_counts": dict(skill_counts),
	            # Copy so callers mutating the audit cannot alter the class policy.
	            "purity_policy": list(self.purity_policy),
	            "scores": {
	                "purity": purity,
	                "quality": quality,
	                "depth": depth,
	                "domain_coverage": coverage,
	            },
	            "gate": {
	                "passed": gate_passed,
	                "reason": "requires purity>=0.97 quality>=0.98 depth>=0.94 and full domain coverage",
	            },
	            "sha256": {
	                "train": hashlib.sha256(train_path.read_bytes()).hexdigest(),
	                "eval": hashlib.sha256(eval_path.read_bytes()).hexdigest(),
	            },
	            "world_best_claim_allowed": False,
	            "claim_note": "This is a high-purity local dataset refinery. Best-in-world requires external data-quality benchmarks.",
	        }

	    def _markdown(self, audit: dict) -> str:
	        """Render the headline audit numbers as a short Markdown report."""
	        return "\n".join(
	            [
	                "# TinyMind HyperPure Knowledge Manifest",
	                "",
	                f"- Records: {audit['records_written']}",
	                f"- Gate passed: {audit['gate']['passed']}",
	                f"- Purity: {audit['scores']['purity']:.4f}",
	                f"- Quality: {audit['scores']['quality']:.4f}",
	                f"- Depth: {audit['scores']['depth']:.4f}",
	                f"- Domain coverage: {audit['scores']['domain_coverage']:.2%}",
	                "- World-best claim: false",
	                "",
	            ]
	        )

Xet Storage Details

Size: 17.2 kB
Xet hash: b2db309acd2ca5183bf932bbd439ff17e1f6bab3b60130504783815cb01528c9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.