Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /pure_forge.py
| """Pure dataset forge for TinyMind. | |
| The forge intentionally starts conservative: it accepts only schema-valid, | |
| high-score records from clean/verified sources, removes duplicates, writes a | |
| JSONL artifact, and saves a manifest with domain counts plus content hash. | |
| """ | |
| from __future__ import annotations | |
| from collections import Counter | |
| from dataclasses import asdict, dataclass | |
| import hashlib | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import Iterable | |
# Version tag stamped onto every emitted row and onto the manifest.
SCHEMA_VERSION = "tinymind-pure-v1"

# Substrings that disqualify a record when found in its lower-cased
# question/answer text. Entries must themselves be lower-case because the
# text is lower-cased before matching.
BAD_MARKERS = (
    # English hedging / assistant boilerplate.
    "as an ai",
    "i don't know",
    "i do not know",
    "not sure",
) + (
    # Thai equivalents of "do not know" and "not sure".
    "ไม่ทราบ",
    "ไม่แน่ใจ",
)
@dataclass
class PureRecord:
    """One candidate question/answer sample for the forge.

    The class must be a dataclass: the rest of this module serialises
    records with ``dataclasses.asdict``, which raises ``TypeError`` for
    anything that is not a dataclass instance.

    Attributes:
        domain: Topic label; used for de-duplication, ordering and the
            per-domain counts in the manifest.
        lang: Language code of the sample.
        question: Prompt text.
        answer: Response text.
        source: Where the sample came from.
        license: License recorded for this sample.
        quality_score: Score compared against the forge's minimum quality.
        rarity_score: Score compared against the forge's minimum rarity;
            defaults to ``0.0``.
    """

    domain: str
    lang: str
    question: str
    answer: str
    source: str
    license: str
    quality_score: float
    rarity_score: float = 0.0
def _normalized_key(record: PureRecord) -> str:
    """Build the de-duplication key for *record*.

    The question is trimmed, lower-cased and has every whitespace run
    collapsed to a single space; the result is prefixed with the record's
    language and domain, so the same question in another language or
    domain yields a different key.
    """
    lowered = record.question.strip().lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return "{}:{}:{}".format(record.lang, record.domain, collapsed)
def _stable_id(record: PureRecord) -> str:
    """Return a 20-hex-character content id for *record*.

    The id is the leading part of the SHA-256 digest of the record's
    fields serialised as key-sorted JSON (non-ASCII kept as-is, UTF-8
    encoded), so equal field values always give the same id.
    """
    fields = asdict(record)
    canonical = json.dumps(fields, ensure_ascii=False, sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()[:20]
def _contains_bad_marker(record: PureRecord) -> bool:
    """Report whether the record's question or answer contains a bad marker.

    Question and answer are joined with a newline and lower-cased, then
    scanned for each entry of ``BAD_MARKERS`` as a plain substring.
    """
    haystack = "{}\n{}".format(record.question, record.answer).lower()
    for marker in BAD_MARKERS:
        if marker in haystack:
            return True
    return False
class PureDatasetForge:
    """Create clean TinyMind training JSONL with evidence manifest.

    Typical use is ``PureDatasetForge().write_jsonl(records, path)``, which
    filters records with :meth:`is_pure`, de-duplicates them with
    :meth:`select`, writes one JSON object per line and saves a manifest
    file next to the output.
    """

    # Policy labels copied verbatim into every manifest.
    purity_policy = (
        "human_or_verified_synthetic",
        "schema_valid",
        "deduplicated_by_question_domain_language",
        "quality_and_rarity_thresholded",
        "license_recorded_per_sample",
    )

    def __init__(self, min_quality: float = 0.85, min_rarity: float = 0.0):
        """Store the score thresholds, coerced to ``float``.

        Args:
            min_quality: Lowest accepted ``quality_score``.
            min_rarity: Lowest accepted ``rarity_score``.
        """
        self.min_quality = float(min_quality)
        self.min_rarity = float(min_rarity)

    @staticmethod
    def _meets_floor(score: object, floor: float) -> bool:
        """Return True when *score* is a finite real number >= *floor*."""
        if not isinstance(score, (int, float)):
            return False
        # NaN makes every comparison False, so a plain "score < floor" test
        # would let NaN through; NaN/Infinity also serialise to invalid JSON.
        return abs(score) < float("inf") and score >= floor

    def is_pure(self, record: PureRecord) -> bool:
        """Decide whether *record* may enter the dataset.

        A record is accepted only when all of the following hold:

        * every text field is a non-blank string;
        * ``lang`` is ``"th"`` or ``"en"``;
        * both scores are finite numbers at or above the thresholds;
        * the stripped question has at least 8 characters and the stripped
          answer at least 16;
        * neither question nor answer contains a bad marker.

        Malformed records (wrong field types, NaN scores) are rejected
        rather than raising.
        """
        text_fields = (
            record.domain,
            record.lang,
            record.question,
            record.answer,
            record.source,
            record.license,
        )
        # Require real strings: str(None) is the truthy "None", so coercing
        # with str() would wave missing values through.
        if not all(isinstance(value, str) and value.strip() for value in text_fields):
            return False
        if record.lang not in {"th", "en"}:
            return False
        if not self._meets_floor(record.quality_score, self.min_quality):
            return False
        if not self._meets_floor(record.rarity_score, self.min_rarity):
            return False
        if len(record.question.strip()) < 8 or len(record.answer.strip()) < 16:
            return False
        return not _contains_bad_marker(record)

    def select(self, records: Iterable[PureRecord]) -> list[PureRecord]:
        """Return the pure, de-duplicated records in a deterministic order.

        Candidates are ranked best-first by (quality, rarity, answer
        length); the first record seen for each normalized key is kept, so
        the best-ranked duplicate wins. The result is sorted by
        (domain, lang, question).
        """
        ranked = sorted(
            (record for record in records if self.is_pure(record)),
            key=lambda r: (r.quality_score, r.rarity_score, len(r.answer)),
            reverse=True,
        )
        kept: dict[str, PureRecord] = {}
        for record in ranked:
            # setdefault keeps the earlier (better-ranked) entry for a key.
            kept.setdefault(_normalized_key(record), record)
        return sorted(kept.values(), key=lambda r: (r.domain, r.lang, r.question))

    def to_row(self, record: PureRecord) -> dict:
        """Convert *record* to an output row with ``id`` and ``schema_version``."""
        row = asdict(record)
        row["id"] = _stable_id(record)
        row["schema_version"] = SCHEMA_VERSION
        return row

    def write_jsonl(self, records: Iterable[PureRecord], out_path: str | Path) -> dict:
        """Write the selected records as JSONL and save a manifest beside it.

        Args:
            records: Candidate records; they are filtered and de-duplicated
                with :meth:`select` before writing.
            out_path: Destination file; missing parent directories are
                created. The manifest path is *out_path* with its suffix
                replaced by ``.manifest.json``.

        Returns:
            The manifest dictionary that was written: schema version,
            record count, per-domain counts, SHA-256 and byte size of the
            JSONL file, the purity policy and both thresholds.
        """
        out = Path(out_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        rows = [self.to_row(record) for record in self.select(records)]
        # newline="\n" stops the text layer translating line endings, so the
        # bytes (and therefore the hash) do not depend on the platform.
        with out.open("w", encoding="utf-8", newline="\n") as f:
            for row in rows:
                f.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
        # Hash what actually landed on disk.
        payload = out.read_bytes()
        manifest = {
            "schema_version": SCHEMA_VERSION,
            "records_written": len(rows),
            "domain_counts": dict(Counter(row["domain"] for row in rows)),
            "sha256": hashlib.sha256(payload).hexdigest(),
            "bytes": len(payload),
            "purity_policy": list(self.purity_policy),
            "min_quality": self.min_quality,
            "min_rarity": self.min_rarity,
        }
        manifest_path = out.with_suffix(".manifest.json")
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        return manifest
Xet Storage Details
- Size:
- 4.27 kB
- Xet hash:
- 7bbee9ed2b34a236833cbf9d6e9d51f5658190f1bf3c9a8e5b5da2ea3fedd4c4
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.