Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /hyper_pure_refinery.py
| """HyperPure Knowledge Refinery for TinyMind. | |
| This refinery turns small high-trust seed knowledge into dense, auditable, | |
| training-ready records. Each record carries a claim, evidence, verification, | |
| failure mode, transfer principle, and purity score so the model learns reusable | |
| reasoning instead of memorising loose prose. | |
| """ | |
| from __future__ import annotations | |
| from collections import Counter | |
| from dataclasses import asdict, dataclass | |
| from datetime import datetime, timezone | |
| import hashlib | |
| import json | |
| from pathlib import Path | |
| import re | |
| from typing import Iterable | |
# Schema tag stamped onto every emitted JSONL row and onto the audit manifest.
SCHEMA_VERSION = "tinymind-hyper-pure-refinery-v1"
# Lower-case substrings that flag text as junk. _junk_score lower-cases the
# candidate text and tests each marker with a plain substring match, so a
# marker also fires when it appears inside a longer word.
JUNK_MARKERS = (
    "lorem ipsum",
    "todo",
    "fixme",
    "???",
    "as an ai language model",
    "subscribe",
    "click here",
)
# Generation blueprint, keyed by domain name. Each entry carries:
#   "lang"   - "th" selects the Thai question/answer templates in
#              _make_record; any other value selects the English templates.
#   "seed"   - seed sentence; hashed together with the domain and skill names
#              to produce each record's source_sha256.
#   "skills" - skill names; build_records emits `records_per_skill` records
#              for every skill listed here.
# The audit's domain-coverage score is measured against the number of keys in
# this mapping.
DOMAIN_BLUEPRINTS = {
    "reasoning_logic": {
        "lang": "en",
        "seed": "Valid reasoning separates claim, premise, inference rule, counterexample, and verification.",
        "skills": ["modus_ponens", "contrapositive", "contradiction_repair", "evidence_policy"],
    },
    "thai_expert_language": {
        "lang": "th",
        "seed": "คำตอบภาษาไทยที่แม่นยำต้องแยกแก่นความหมาย เหตุผล หลักฐาน เงื่อนไข และข้อจำกัดอย่างเป็นธรรมชาติ",
        "skills": ["semantic_precision", "natural_explanation", "formal_to_plain_thai", "ambiguity_control"],
    },
    "english_expert_language": {
        "lang": "en",
        "seed": "Expert English answers preserve scope, modality, causality, and concrete evidence while staying readable.",
        "skills": ["technical_style", "scope_control", "causal_explanation", "calibrated_uncertainty"],
    },
    "polyglot_software": {
        "lang": "en",
        "seed": "Robust software projects use contracts, reproducible builds, typed boundaries, tests, benchmarks, and release evidence.",
        "skills": ["python", "typescript", "rust", "go", "cpp", "java", "swift", "project_scaffold"],
    },
    "tool_sandbox_os": {
        "lang": "en",
        "seed": "Tool-using AI should inspect before act, run sandboxed checks, capture stdout/stderr, and verify file artifacts.",
        "skills": ["powershell", "bash", "cmd", "lua_sandbox", "rust_tooling", "file_manifest"],
    },
    "retrieval_grounding": {
        "lang": "en",
        "seed": "Grounded answers must cite exact chunks, hashes, retrieval terms, and refuse unsupported facts.",
        "skills": ["evidence_ledger", "hash_recall", "semantic_retrieval", "hallucination_refusal"],
    },
    "long_context_memory": {
        "lang": "en",
        "seed": "Ten-million-token recall is guaranteed by exact hashed archives and regenerated context, not by pretending hidden state stores everything.",
        "skills": ["10m_archive", "passkey_recall", "chunk_hash", "bounded_state"],
    },
    "math_science": {
        "lang": "en",
        "seed": "Hard technical answers require definitions, assumptions, derivation, unit checks, and independent validation.",
        "skills": ["algebra", "probability", "physics_units", "scientific_method"],
    },
    "benchmark_claims": {
        "lang": "en",
        "seed": "Model quality claims are valid only when benchmark name, split, date, artifact, and score are saved.",
        "skills": ["mmlu_pro", "ifeval", "arena", "provider_eval", "claim_gate"],
    },
    "self_improvement": {
        "lang": "en",
        "seed": "Self-improving AI grows only through measured error, targeted data, adapter updates, and regression gates.",
        "skills": ["error_mining", "data_refinement", "adapter_growth", "regression_gate"],
    },
}
@dataclass
class HyperPureRecord:
    """One claim/evidence/verification (CEV) training record.

    This must be a dataclass: the refinery builds instances with keyword
    arguments and serialises them with ``dataclasses.asdict``. Both fail with
    ``TypeError`` on a plain class that only carries annotations.
    """

    # Where the record sits in the blueprint table.
    domain: str
    skill: str
    lang: str
    # Prompt/response pair emitted as the <user>/<assistant> turns.
    question: str
    answer: str
    # CEV triple; the selection gate requires all three to be non-empty.
    claim: str
    evidence: str
    verification: str
    # Reuse guidance and known failure condition for the skill.
    transfer_principle: str
    failure_mode: str
    # Description of the content that should be rejected for this skill.
    negative_filter: str
    # Provenance: source label, SHA-256 of the seed text, licence tag.
    source: str
    source_sha256: str
    license: str
    # Scores consulted by the selection gate and averaged by the audit.
    quality_score: float
    purity_score: float
    rarity_score: float
    depth_score: float
def _sha256(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
def _norm(text: str) -> str:
    """Normalise *text* for comparison.

    Leading and trailing whitespace is removed, the text is lower-cased, and
    every run of whitespace is collapsed to a single space.
    """
    trimmed = text.strip()
    lowered = trimmed.lower()
    return re.sub(r"\s+", " ", lowered)
def _junk_score(text: str) -> float:
    """Score how junk-like *text* is, from 0.0 (clean) to 1.0 (junk).

    Penalties are added in this order and the total is capped at 1.0:
      * 0.8  if any JUNK_MARKERS substring occurs in the lower-cased text;
      * 0.2  if the text has fewer than 40 word tokens;
      * 0.35 if fewer than 28% of the tokens are distinct.
    """
    lowered = text.lower()
    # Tokens are runs of word characters; the explicit Thai block also keeps
    # Thai code points that \w alone would not match.
    tokens = re.findall(r"[\w\u0E00-\u0E7F]+", lowered)

    penalty = 0.0
    for marker in JUNK_MARKERS:
        if marker in lowered:
            penalty += 0.8
            break  # a single hit is enough; markers do not stack
    if len(tokens) < 40:
        penalty += 0.2
    if tokens and len(set(tokens)) / len(tokens) < 0.28:
        penalty += 0.35
    return penalty if penalty < 1.0 else 1.0
def _record_id(record: HyperPureRecord) -> str:
    """Derive a 24-hex-character identifier from a record's field values.

    The record is converted to a dict, serialised as key-sorted JSON with
    non-ASCII characters kept as-is, hashed with ``_sha256``, and the digest
    is truncated to its first 24 characters.
    """
    payload = asdict(record)
    canonical = json.dumps(payload, ensure_ascii=False, sort_keys=True)
    digest = _sha256(canonical)
    return digest[:24]
class HyperPureKnowledgeRefinery:
    """Create dense expert CEV records with strict purity gates.

    The refinery expands ``DOMAIN_BLUEPRINTS`` into templated records
    (``build_records``), filters and de-duplicates them (``select``), splits
    them per domain into train/eval sets, and writes two JSONL files plus a
    JSON and a Markdown manifest (``write_dataset``).
    """

    # Gate names reported verbatim in the manifest under "purity_policy".
    purity_policy = [
        "claim_evidence_verification_required",
        "source_hash_required",
        "failure_mode_required",
        "transfer_principle_required",
        "dedupe_by_domain_skill_question",
        "junk_marker_blocked",
        "short_or_repetitive_text_blocked",
    ]

    def __init__(self, records_per_skill: int = 2, eval_ratio: float = 0.2):
        """Configure the refinery.

        Args:
            records_per_skill: Variants generated per skill; coerced to int
                and raised to at least 1.
            eval_ratio: Fraction of each domain held out for evaluation;
                clamped to the range [0.05, 0.5].
        """
        self.records_per_skill = max(1, int(records_per_skill))
        self.eval_ratio = min(max(float(eval_ratio), 0.05), 0.5)

    def build_records(self) -> list[HyperPureRecord]:
        """Generate ``records_per_skill`` records for every blueprint skill."""
        records: list[HyperPureRecord] = []
        for domain, blueprint in DOMAIN_BLUEPRINTS.items():
            seed = str(blueprint["seed"])
            lang = str(blueprint["lang"])
            for skill in blueprint["skills"]:
                for variant in range(self.records_per_skill):
                    records.append(self._make_record(domain, skill, lang, seed, variant))
        return records

    def _make_record(self, domain: str, skill: str, lang: str, seed: str, variant: int) -> HyperPureRecord:
        """Fill the Thai or English template for one domain/skill/variant.

        Args:
            domain: Blueprint domain name.
            skill: Skill name within the domain.
            lang: ``"th"`` selects the Thai templates; anything else selects
                the English ones.
            seed: Domain seed sentence, hashed into ``source_sha256``.
            variant: Zero-based variant index, embedded in the question and
                evidence so variants of one skill stay distinct.
        """
        source_text = f"{domain}:{skill}:{seed}"
        source_hash = _sha256(source_text)
        if lang == "th":
            question = f"สกัดแก่นความรู้ระดับลึกของ {skill} ในโดเมน {domain} แบบตรวจซ้ำได้ [variant {variant}]"
            answer = (
                f"แก่นของ {skill} คือการเปลี่ยนความรู้จากข้อความทั่วไปให้เป็นขั้นตอนที่ตรวจได้: "
                f"เริ่มจากนิยามขอบเขต แยก claim หลัก ระบุ evidence ที่มี hash ตรวจซ้ำได้ "
                f"อธิบายเหตุผลเป็นลำดับ แล้วปิดท้ายด้วยข้อจำกัดและ failure mode. "
                f"เมื่อนำไปใช้กับ {domain} โมเดลต้องตอบเป็นภาษาธรรมชาติ แต่ทุกข้อเท็จจริงต้องย้อนกลับไปยังหลักฐานได้"
            )
            transfer = "ใช้กับโจทย์ใหม่โดยถามก่อนว่า claim ใดต้องพิสูจน์ หลักฐานอยู่ไหน และอะไรคือเงื่อนไขที่ทำให้คำตอบผิด"
            failure = "คำตอบจะปนเปื้อนถ้าใช้คำสวยแต่ไม่มี evidence หรือไม่บอกเงื่อนไขที่ทำให้ claim ใช้ไม่ได้"
            negative = "บล็อกข้อความที่สั้น ซ้ำ ฟุ้ง หรืออ้างความจริงโดยไม่มี hash/source"
        else:
            question = f"Extract the deepest reusable knowledge pattern for {skill} in {domain} [variant {variant}]"
            answer = (
                f"The reusable core of {skill} is to turn raw knowledge into an auditable procedure: "
                f"define scope, isolate the claim, attach hash-backed evidence, explain the inference path, "
                f"state limits, and test the result against a failure case. In {domain}, the model should answer "
                f"naturally while keeping every factual commitment traceable to evidence rather than style or memorized phrasing."
            )
            transfer = "For a new task, identify the claim, locate evidence, choose the smallest valid rule, verify output, then store the checked artifact."
            failure = "The method fails when the answer sounds fluent but lacks source hashes, boundary conditions, or a reproducible check."
            negative = "Reject filler, repetition, unsupported rankings, vague authority claims, and records without CEV fields."
        claim = f"{domain}/{skill} teaches a transferable expert operation, not a fixed answer."
        evidence = f"hyper_pure_seed_sha256={source_hash}; domain={domain}; skill={skill}; variant={variant}"
        verification = (
            "Recompute source_sha256, confirm CEV fields are nonempty, check answer diversity, "
            "verify transfer_principle and failure_mode exist, then run train/eval split hash."
        )
        quality = 0.985
        # Purity is scored over the same fields that select() later re-checks.
        purity = 1.0 - _junk_score("\n".join([question, answer, claim, evidence, verification, transfer, failure]))
        rarity = 0.90 + min(0.09, 0.01 * variant)
        # Answers of 280 characters or fewer fall below select()'s 0.94 floor.
        depth = 0.96 if len(answer) > 280 else 0.90
        return HyperPureRecord(
            domain=domain,
            skill=skill,
            lang=lang,
            question=question,
            answer=answer,
            claim=claim,
            evidence=evidence,
            verification=verification,
            transfer_principle=transfer,
            failure_mode=failure,
            negative_filter=negative,
            source="tinymind_hyper_pure_refinery_seed",
            source_sha256=source_hash,
            license="internal-clean-synthetic-cev",
            quality_score=quality,
            purity_score=purity,
            rarity_score=rarity,
            depth_score=depth,
        )

    def select(self, records: Iterable[HyperPureRecord]) -> tuple[list[HyperPureRecord], list[dict]]:
        """Apply the purity gates and de-duplicate.

        Returns:
            ``(kept, blocked)``. ``kept`` holds the first record seen for each
            domain/skill/normalised-question key. ``blocked`` holds one dict
            per rejected record with its ``domain``, ``skill`` and the list of
            ``reasons`` it failed.
        """
        kept: dict[str, HyperPureRecord] = {}
        blocked: list[dict] = []
        for record in records:
            text = "\n".join(
                [
                    record.question,
                    record.answer,
                    record.claim,
                    record.evidence,
                    record.verification,
                    record.transfer_principle,
                    record.failure_mode,
                ]
            )
            reasons = []
            if _junk_score(text) > 0.05:
                reasons.append("junk_score")
            if min(record.quality_score, record.purity_score, record.depth_score) < 0.94:
                reasons.append("score_floor")
            if not all([record.claim, record.evidence, record.verification, record.source_sha256]):
                reasons.append("missing_cev_or_hash")
            # purity_policy advertises these two gates, so enforce them here
            # rather than only listing them in the manifest.
            if not (record.transfer_principle and record.failure_mode):
                reasons.append("missing_transfer_or_failure")
            if reasons:
                blocked.append({"domain": record.domain, "skill": record.skill, "reasons": reasons})
                continue
            key = f"{record.domain}:{record.skill}:{_norm(record.question)}"
            kept.setdefault(key, record)  # first occurrence wins
        return list(kept.values()), blocked

    def write_dataset(self, out_dir: str | Path) -> dict:
        """Build, filter, split and write the dataset under *out_dir*.

        Writes ``hyper_pure_train.jsonl``, ``hyper_pure_eval.jsonl``,
        ``hyper_pure_manifest.json`` and ``hyper_pure_manifest.md``; the
        directory is created if missing.

        Returns:
            The audit dict that was serialised into the JSON manifest.
        """
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        selected, blocked = self.select(self.build_records())
        selected.sort(key=lambda row: (row.domain, row.skill, row.lang, row.question))
        train, eval_rows = self._split(selected)
        train_path = out / "hyper_pure_train.jsonl"
        eval_path = out / "hyper_pure_eval.jsonl"
        self._write_jsonl(train_path, train)
        self._write_jsonl(eval_path, eval_rows)
        audit = self._audit(selected, blocked, train_path, eval_path)
        manifest_path = out / "hyper_pure_manifest.json"
        md_path = out / "hyper_pure_manifest.md"
        audit["manifest_path"] = str(manifest_path)
        audit["markdown_path"] = str(md_path)
        md_path.write_text(self._markdown(audit), encoding="utf-8")
        # Written once, after both paths are recorded, so the manifest on disk
        # is never an intermediate copy without "markdown_path".
        manifest_path.write_text(json.dumps(audit, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
        return audit

    def _split(self, rows: list[HyperPureRecord]) -> tuple[list[HyperPureRecord], list[HyperPureRecord]]:
        """Split *rows* per domain into ``(train, eval)``.

        The trailing ``round(len * eval_ratio)`` rows of each domain, and
        never fewer than one, go to eval; the rest go to train. A domain with
        a single row therefore contributes no train rows.
        """
        by_domain: dict[str, list[HyperPureRecord]] = {}
        for row in rows:
            by_domain.setdefault(row.domain, []).append(row)
        train: list[HyperPureRecord] = []
        eval_rows: list[HyperPureRecord] = []
        for domain_rows in by_domain.values():
            # n_eval >= 1 keeps the negative slices well-defined; a zero would
            # make [-0:] select the whole list.
            n_eval = max(1, int(round(len(domain_rows) * self.eval_ratio)))
            eval_rows.extend(domain_rows[-n_eval:])
            train.extend(domain_rows[:-n_eval])
        return train, eval_rows

    def _row(self, record: HyperPureRecord) -> dict:
        """Convert a record to its JSONL dict: fields, id, schema, text.

        NOTE: ``created_at`` is taken from the clock for every row, so the
        bytes of the JSONL files, and the file hashes in the manifest, differ
        from run to run even though ``id`` is stable.
        """
        row = asdict(record)
        row["id"] = _record_id(record)
        row["schema_version"] = SCHEMA_VERSION
        row["created_at"] = datetime.now(timezone.utc).isoformat()
        row["text"] = (
            f"<domain>{record.domain}</domain>\n"
            f"<skill>{record.skill}</skill>\n"
            f"<claim>{record.claim}</claim>\n"
            f"<evidence>{record.evidence}</evidence>\n"
            f"<verification>{record.verification}</verification>\n"
            f"<transfer>{record.transfer_principle}</transfer>\n"
            f"<failure>{record.failure_mode}</failure>\n"
            f"<user>{record.question}</user>\n"
            f"<assistant>{record.answer}</assistant>"
        )
        return row

    def _write_jsonl(self, path: Path, records: list[HyperPureRecord]) -> None:
        """Write one key-sorted JSON object per line, UTF-8, ``\\n`` endings."""
        with path.open("w", encoding="utf-8", newline="\n") as f:
            for record in records:
                f.write(json.dumps(self._row(record), ensure_ascii=False, sort_keys=True) + "\n")

    @staticmethod
    def _count_jsonl_rows(payload: bytes) -> int:
        """Count non-blank lines in raw JSONL bytes.

        Counting on bytes splits only on ``\\n``/``\\r``, which JSON always
        escapes inside strings. ``str.splitlines`` would also split on
        U+2028, U+2029 and U+0085, which ``ensure_ascii=False`` leaves
        unescaped, and so could over-count rows.
        """
        return sum(1 for line in payload.splitlines() if line.strip())

    def _audit(self, rows: list[HyperPureRecord], blocked: list[dict], train_path: Path, eval_path: Path) -> dict:
        """Build the manifest dict for an already-written dataset.

        Args:
            rows: All selected records (train and eval together).
            blocked: Rejection entries returned by ``select``.
            train_path: Existing train JSONL file; read back for count/hash.
            eval_path: Existing eval JSONL file; read back for count/hash.
        """
        domain_counts = Counter(row.domain for row in rows)
        skill_counts = Counter(row.skill for row in rows)
        denom = max(len(rows), 1)  # avoid dividing by zero when nothing was kept
        purity = sum(row.purity_score for row in rows) / denom
        quality = sum(row.quality_score for row in rows) / denom
        depth = sum(row.depth_score for row in rows) / denom
        coverage = len(domain_counts) / max(len(DOMAIN_BLUEPRINTS), 1)
        gate_passed = bool(rows) and purity >= 0.97 and quality >= 0.98 and depth >= 0.94 and coverage == 1.0
        # Read each file once so the row count and hash describe the same bytes.
        train_bytes = train_path.read_bytes()
        eval_bytes = eval_path.read_bytes()
        return {
            "schema_version": SCHEMA_VERSION,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "train_path": str(train_path),
            "eval_path": str(eval_path),
            "records_written": len(rows),
            "train_records": self._count_jsonl_rows(train_bytes),
            "eval_records": self._count_jsonl_rows(eval_bytes),
            "blocked_records": len(blocked),
            "blocked": blocked,
            "domain_counts": dict(domain_counts),
            "skill_counts": dict(skill_counts),
            "purity_policy": self.purity_policy,
            "scores": {
                "purity": purity,
                "quality": quality,
                "depth": depth,
                "domain_coverage": coverage,
            },
            "gate": {
                "passed": gate_passed,
                "reason": "requires purity>=0.97 quality>=0.98 depth>=0.94 and full domain coverage",
            },
            "sha256": {
                "train": hashlib.sha256(train_bytes).hexdigest(),
                "eval": hashlib.sha256(eval_bytes).hexdigest(),
            },
            "world_best_claim_allowed": False,
            "claim_note": "This is a high-purity local dataset refinery. Best-in-world requires external data-quality benchmarks.",
        }

    def _markdown(self, audit: dict) -> str:
        """Render the headline audit numbers as a short Markdown summary."""
        return "\n".join(
            [
                "# TinyMind HyperPure Knowledge Manifest",
                "",
                f"- Records: {audit['records_written']}",
                f"- Gate passed: {audit['gate']['passed']}",
                f"- Purity: {audit['scores']['purity']:.4f}",
                f"- Quality: {audit['scores']['quality']:.4f}",
                f"- Depth: {audit['scores']['depth']:.4f}",
                f"- Domain coverage: {audit['scores']['domain_coverage']:.2%}",
                "- World-best claim: false",
                "",
            ]
        )
Xet Storage Details
- Size:
- 17.2 kB
- Xet hash:
- b2db309acd2ca5183bf932bbd439ff17e1f6bab3b60130504783815cb01528c9
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.