bbkdevops's picture
download
raw
4.27 kB
"""Pure dataset forge for TinyMind.
The forge intentionally starts conservative: it accepts only schema-valid,
high-score records from clean/verified sources, removes duplicates, writes a
JSONL artifact, and saves a manifest with domain counts plus content hash.
"""
from __future__ import annotations
from collections import Counter
from dataclasses import asdict, dataclass
import hashlib
import json
import re
from pathlib import Path
from typing import Iterable
# Version tag stamped onto every emitted row and onto the manifest.
SCHEMA_VERSION = "tinymind-pure-v1"
# Lower-case phrases whose presence in a record's question or answer
# disqualifies the record (boilerplate refusals / non-answers).
BAD_MARKERS = (
# English markers.
"as an ai",
"i don't know",
"i do not know",
"not sure",
# Thai markers: "don't know" and "not sure".
"ไม่ทราบ",
"ไม่แน่ใจ",
)
@dataclass(frozen=True)
class PureRecord:
    """One immutable question/answer sample considered for the dataset.

    Instances are frozen, so they are hashable and cannot be modified after
    construction. Only ``rarity_score`` has a default; every other field
    must be supplied.
    """

    # Topic label; also used for de-duplication and per-domain counts.
    domain: str
    # Language code of the sample.
    lang: str
    # Prompt text.
    question: str
    # Reference answer text.
    answer: str
    # Provenance of the sample.
    source: str
    # License recorded for this individual sample.
    license: str
    # Quality estimate compared against the forge's minimum quality.
    quality_score: float
    # Rarity estimate compared against the forge's minimum rarity.
    rarity_score: float = 0.0
def _normalized_key(record: PureRecord) -> str:
text = re.sub(r"\s+", " ", record.question.strip().lower())
return f"{record.lang}:{record.domain}:{text}"
def _stable_id(record: PureRecord) -> str:
payload = json.dumps(asdict(record), ensure_ascii=False, sort_keys=True)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:20]
def _contains_bad_marker(record: PureRecord) -> bool:
    """Return True if the question or the answer contains a ``BAD_MARKERS`` phrase.

    Each field is checked separately after lower-casing, mapping typographic
    apostrophes to ``'`` and collapsing whitespace runs to one space, so
    "I don’t know" and "not\\nsure" are caught.

    ASCII markers must match as whole phrases (not preceded or followed by a
    word character), so "as an ai" no longer fires inside "has an aid".
    Non-ASCII markers are matched as plain substrings because the script
    they are written in does not separate words with spaces.
    """
    for raw in (record.question, record.answer):
        text = str(raw).lower()
        # Curly quotes would otherwise hide markers such as "i don't know".
        text = text.replace("\u2019", "'").replace("\u2018", "'")
        text = re.sub(r"\s+", " ", text)
        for marker in BAD_MARKERS:
            if marker.isascii():
                # Lookarounds act as word boundaries that also work next to
                # punctuation at either end of the marker.
                pattern = rf"(?<!\w){re.escape(marker)}(?!\w)"
                if re.search(pattern, text):
                    return True
            elif marker in text:
                return True
    return False
class PureDatasetForge:
    """Create clean TinyMind training JSONL with evidence manifest.

    Records are filtered by :meth:`is_pure`, de-duplicated per
    language/domain/normalized question by :meth:`select`, and written by
    :meth:`write_jsonl` together with a JSON manifest describing the output.
    """

    # Policy labels copied verbatim into every manifest ("purity_policy").
    purity_policy = (
        "human_or_verified_synthetic",
        "schema_valid",
        "deduplicated_by_question_domain_language",
        "quality_and_rarity_thresholded",
        "license_recorded_per_sample",
    )

    def __init__(self, min_quality: float = 0.85, min_rarity: float = 0.0):
        """Store the minimum quality and rarity scores a record must reach."""
        self.min_quality = float(min_quality)
        self.min_rarity = float(min_rarity)

    @staticmethod
    def _score_at_least(score: object, minimum: float) -> bool:
        """Return True if *score* is a finite real number >= *minimum*."""
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            return False
        # NaN fails every comparison and +inf fails the upper bound, so both
        # are rejected; neither can be written as standard JSON.
        return minimum <= score < float("inf")

    @staticmethod
    def _rank_key(record: PureRecord) -> tuple:
        """Sort key: best record first, with content-based tie-breakers."""
        return (
            -record.quality_score,
            -record.rarity_score,
            -len(record.answer),
            # Tie-breakers so the surviving duplicate never depends on the
            # order in which records were supplied.
            record.answer,
            record.question,
            record.source,
            record.license,
        )

    def is_pure(self, record: PureRecord) -> bool:
        """Return True if *record* passes every purity check.

        A record is rejected when any text field is not a string, the
        language is not ``"th"`` or ``"en"``, a score is non-numeric,
        non-finite or below its threshold, ``domain``/``source``/``license``
        is blank, the stripped question is shorter than 8 characters, the
        stripped answer is shorter than 16 characters, or the text contains
        a bad marker. Malformed records yield False instead of raising.
        """
        text_fields = (
            record.domain,
            record.lang,
            record.question,
            record.answer,
            record.source,
            record.license,
        )
        if not all(isinstance(value, str) for value in text_fields):
            return False
        if record.lang not in {"th", "en"}:
            return False
        if not self._score_at_least(record.quality_score, self.min_quality):
            return False
        if not self._score_at_least(record.rarity_score, self.min_rarity):
            return False
        if not all(value.strip() for value in (record.domain, record.source, record.license)):
            return False
        # The length floors also cover blank question/answer.
        if len(record.question.strip()) < 8 or len(record.answer.strip()) < 16:
            return False
        return not _contains_bad_marker(record)

    def select(self, records: Iterable[PureRecord]) -> list[PureRecord]:
        """Return the pure, de-duplicated records in a deterministic order.

        Among records sharing a normalized key, the one with the highest
        quality, then rarity, then longest answer is kept; remaining ties
        are broken by content, not input order. The result is sorted by
        ``(domain, lang, question)``.
        """
        ranked = sorted((r for r in records if self.is_pure(r)), key=self._rank_key)
        best: dict[str, PureRecord] = {}
        for record in ranked:
            # First occurrence per key is the best-ranked one.
            best.setdefault(_normalized_key(record), record)
        return sorted(best.values(), key=lambda r: (r.domain, r.lang, r.question))

    def to_row(self, record: PureRecord) -> dict:
        """Return *record* as a dict with ``id`` and ``schema_version`` added."""
        row = asdict(record)
        row["id"] = _stable_id(record)
        row["schema_version"] = SCHEMA_VERSION
        return row

    def write_jsonl(self, records: Iterable[PureRecord], out_path: str | Path) -> dict:
        """Write the selected records as JSONL plus a manifest; return the manifest.

        Parent directories of *out_path* are created if missing. Each row is
        one UTF-8 JSON object with sorted keys, terminated by ``"\\n"``. The
        manifest is written next to the output, at *out_path* with its final
        suffix replaced by ``.manifest.json``. It records the row count,
        per-domain counts, the SHA-256 and byte length of the JSONL payload,
        the purity policy and the thresholds.
        """
        out = Path(out_path)
        out.parent.mkdir(parents=True, exist_ok=True)
        rows = [self.to_row(record) for record in self.select(records)]
        # Serialize fully before touching the file: a serialization error
        # leaves no partial output, and the hash covers exactly the bytes
        # written without reading the file back.
        payload = "".join(
            json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n" for row in rows
        ).encode("utf-8")
        out.write_bytes(payload)
        manifest = {
            "schema_version": SCHEMA_VERSION,
            "records_written": len(rows),
            "domain_counts": dict(Counter(row["domain"] for row in rows)),
            "sha256": hashlib.sha256(payload).hexdigest(),
            "bytes": len(payload),
            "purity_policy": list(self.purity_policy),
            "min_quality": self.min_quality,
            "min_rarity": self.min_rarity,
        }
        manifest_path = out.with_suffix(".manifest.json")
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        return manifest

Xet Storage Details

Size:
4.27 kB
·
Xet hash:
7bbee9ed2b34a236833cbf9d6e9d51f5658190f1bf3c9a8e5b5da2ea3fedd4c4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.