Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /compressed_context_2m.py

bbkdevops

about 1 month ago

download

raw

7.3 kB

	"""2M-token compressed exact context evidence for TinyMind.

	This module proves the practical path for very large context: exact persisted
	tokens in a hash ledger, plus a tiny compression index that stores only anchors,
	hashes, and locators in memory. It is not a claim that the transformer KV cache
	contains two million tokens.
	"""

	from __future__ import annotations

	from dataclasses import asdict, dataclass
	from datetime import datetime, timezone
	import json
	from pathlib import Path
	from typing import Iterable

	from evaluation.evidence_ledger import EvidenceLedgerV2


	# Default benchmark size, and the minimum measured token count required
	# before the report's `compressed_2m_context_ready` gate can become true.
	TARGET_CONTEXT_TOKENS = 2_000_000


	@dataclass(frozen=True)
	class CompressionIndexConfig:
	    """Immutable settings for one compressed-context benchmark run.

	    The benchmark runner builds an instance from its keyword arguments and
	    embeds it (via ``asdict``) in the generated report.
	    """

	    # Number of tokens the run covers; the runner sets this to its
	    # ``token_count`` argument.
	    target_tokens: int = TARGET_CONTEXT_TOKENS
	    # Ledger chunk size in tokens; the runner passes the same value to the
	    # evidence ledger.
	    chunk_tokens: int = 8192
	    # An anchor is kept for every chunk whose id is a multiple of this stride.
	    anchor_stride_chunks: int = 8
	    # Length, in tokens, of each named semantic span registered by the runner.
	    semantic_span_length: int = 16


	def _token_stream(token_count: int, passkeys: dict[int, int]) -> Iterable[int]:
	    """Lazily yield ``token_count`` token ids, one per position.

	    Positions listed in ``passkeys`` yield the mapped value (coerced to
	    ``int``); every other position yields a deterministic filler id in
	    ``[0, 32_000)`` computed from the position alone, so the stream never
	    has to be materialised as a list.
	    """
	    multiplier = 1_103_515_245
	    increment = 12_345
	    filler_vocab = 32_000
	    for index in range(int(token_count)):
	        if index not in passkeys:
	            # Linear-congruential style mix, perturbed by the high bits of
	            # the position, then folded into the filler vocabulary.
	            mixed = (index * multiplier + increment) ^ (index >> 7)
	            yield int(mixed % filler_vocab)
	            continue
	        yield int(passkeys[index])


	def build_compression_index(manifest: dict, cfg: CompressionIndexConfig) -> dict:
	    """Build the sparse in-memory index that accompanies a ledger manifest.

	    Args:
	        manifest: Ledger manifest. Must provide ``"chunks"`` (each with
	            ``chunk_id``, ``start``, ``length`` and ``sha256``),
	            ``"total_tokens"``, ``"chunk_tokens"`` and ``"chunk_count"``.
	            ``"passkeys"`` is optional; a missing or ``None`` value is
	            treated as "no passkeys".
	        cfg: Only ``cfg.anchor_stride_chunks`` is read. Values below 1 are
	            clamped to 1 when selecting anchors, so every chunk is anchored.

	    Returns:
	        A JSON-serialisable dict with the selected anchors, the sorted
	        passkey positions, and a byte-size estimate of the index.

	    Raises:
	        KeyError: if a required manifest or chunk key is missing.
	    """
	    # Clamp once instead of on every chunk; a stride of 0 would otherwise
	    # divide by zero in the modulo below.
	    stride = max(1, cfg.anchor_stride_chunks)
	    anchors = [
	        {
	            "chunk_id": int(chunk["chunk_id"]),
	            "start": int(chunk["start"]),
	            "length": int(chunk["length"]),
	            # Only a 16-character prefix of the digest is kept in the index.
	            "sha256_prefix": str(chunk["sha256"])[:16],
	        }
	        for chunk in manifest["chunks"]
	        if int(chunk["chunk_id"]) % stride == 0
	    ]
	    # Keys may arrive as strings (e.g. after a JSON round-trip), hence int().
	    passkey_positions = sorted(int(pos) for pos in (manifest.get("passkeys") or {}))
	    # Size estimate: the anchors' canonical JSON encoding plus a flat
	    # 24 bytes budgeted per passkey locator.
	    anchors_json = json.dumps(anchors, sort_keys=True).encode("utf-8")
	    estimated_index_bytes = len(anchors_json) + len(passkey_positions) * 24
	    total_tokens = int(manifest["total_tokens"])
	    return {
	        "schema_version": "tinymind-2m-compression-index-v1",
	        "method": "AxiomDeltaHashIndex",
	        # NOTE(review): the description mentions semantic spans, but this
	        # function does not add any span data to the index - confirm wording.
	        "description": (
	            "Exact tokens remain in the ledger. The compressed index stores sparse chunk anchors, "
	            "hash prefixes, semantic spans, and passkey locators for retrieval/regeneration."
	        ),
	        "target_tokens": total_tokens,
	        "chunk_tokens": int(manifest["chunk_tokens"]),
	        "chunk_count": int(manifest["chunk_count"]),
	        # Reports the configured stride, not the clamped one used above.
	        "anchor_stride_chunks": int(cfg.anchor_stride_chunks),
	        "anchor_count": len(anchors),
	        "anchors": anchors,
	        "passkey_positions": passkey_positions,
	        "kv_tokens_stored": 0,
	        "estimated_index_bytes": estimated_index_bytes,
	        # max(1, ...) guards against division by zero for an empty manifest.
	        "index_bytes_per_token": estimated_index_bytes / max(1, total_tokens),
	    }


	def run_compressed_context_2m_benchmark(
	    out_dir: str | Path,
	    *,
	    token_count: int = TARGET_CONTEXT_TOKENS,
	    chunk_tokens: int = 8192,
	    anchor_stride_chunks: int = 8,
	) -> dict:
	    """Ingest a synthetic token stream into a ledger and report recall evidence.

	    The run streams ``token_count`` tokens into an ``EvidenceLedgerV2`` under
	    ``out_dir / "ledger"``, builds the compression index from the resulting
	    manifest, recalls the planted passkeys and named semantic spans, and
	    writes the report as JSON and Markdown into ``out_dir``.

	    Args:
	        out_dir: Output directory; created (with parents) if missing.
	        token_count: Number of tokens to stream; must be at least 1.
	        chunk_tokens: Chunk size handed to the ledger and recorded in the config.
	        anchor_stride_chunks: Anchor stride recorded in the config and used
	            when building the compression index.

	    Returns:
	        The report dict, including the ``json_path`` and ``markdown_path``
	        of the two files written.

	    Raises:
	        ValueError: if ``token_count`` is smaller than 1.
	    """
	    if token_count < 1:
	        # An empty stream would plant a passkey at position -1, which can
	        # never be produced by the token stream.
	        raise ValueError(f"token_count must be at least 1, got {token_count!r}")

	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    cfg = CompressionIndexConfig(
	        target_tokens=token_count,
	        chunk_tokens=chunk_tokens,
	        anchor_stride_chunks=anchor_stride_chunks,
	    )
	    # Passkeys sit at the first token, at each third, and at the last token.
	    # For very small streams positions coincide and the later entry wins.
	    positions = [0, token_count // 3, (token_count * 2) // 3, token_count - 1]
	    passkeys = {pos: 120_000_000 + i for i, pos in enumerate(positions)}
	    # Each span is (start, length).
	    semantic_spans = {
	        "first": (0, cfg.semantic_span_length),
	        "middle": (token_count // 2, cfg.semantic_span_length),
	        "last_window": (max(0, token_count - cfg.semantic_span_length), cfg.semantic_span_length),
	    }

	    ledger = EvidenceLedgerV2(out / "ledger", chunk_tokens=chunk_tokens)
	    manifest = ledger.ingest(_token_stream(token_count, passkeys), passkeys=passkeys, semantic_spans=semantic_spans)
	    integrity = ledger.verify_integrity()
	    index = build_compression_index(manifest, cfg)

	    # Recall every planted passkey and annotate the ledger's answer in place.
	    passkey_hits = []
	    for position, expected in passkeys.items():
	        hit = ledger.recall_position(position)
	        hit["expected"] = expected
	        hit["matched"] = int(hit["token"]) == int(expected)
	        passkey_hits.append(hit)
	    # Evaluated once and shared by the measurements and the claim gate.
	    passkey_recall_passed = all(hit["matched"] for hit in passkey_hits)

	    # Summarise each recalled span rather than embedding its tokens.
	    semantic_hits = {}
	    for name in semantic_spans:
	        span = ledger.recall_semantic(name)
	        tokens = span["tokens"]
	        semantic_hits[name] = {
	            "start": span["start"],
	            "length": span["length"],
	            "token_count": len(tokens),
	            "first_token": tokens[0] if tokens else None,
	        }
	    semantic_recall_passed = all(
	        summary["token_count"] == cfg.semantic_span_length for summary in semantic_hits.values()
	    )

	    # On-disk size of all chunk files listed in the manifest.
	    archive_bytes = sum(Path(chunk["path"]).stat().st_size for chunk in manifest["chunks"])
	    report = {
	        "schema_version": "tinymind-compressed-context-2m-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "config": asdict(cfg),
	        "ledger_manifest": str(ledger.manifest_path),
	        "semantic_index_path": str(ledger.semantic_path),
	        "compression_index": index,
	        "integrity": integrity,
	        "measurements": {
	            "measured_tokens": int(token_count),
	            "archive_bytes": int(archive_bytes),
	            "archive_bytes_per_token": archive_bytes / max(1, int(token_count)),
	            "index_bytes": int(index["estimated_index_bytes"]),
	            "index_bytes_per_token": index["index_bytes_per_token"],
	            "kv_tokens_stored": 0,
	            "passkey_recall_passed": passkey_recall_passed,
	            "semantic_recall_passed": semantic_recall_passed,
	        },
	        "passkey_hits": passkey_hits,
	        "semantic_hits": semantic_hits,
	        "claim_gate": {
	            # Ready only for a full-size run whose passkeys were all
	            # recalled and whose ledger integrity check passed.
	            "compressed_2m_context_ready": token_count >= TARGET_CONTEXT_TOKENS
	            and passkey_recall_passed
	            and integrity["passed"],
	            "full_kv_2m_claim_allowed": False,
	            # NOTE(review): unconditional, unlike the gate above - confirm
	            # this is a static policy flag and not meant to track recall.
	            "exact_recall_claim_allowed": True,
	            "world_longest_context_claim_allowed": False,
	            "reason": "2M exact recall is backed by ledger/index evidence; full transformer KV and external rank claims are not asserted.",
	        },
	    }
	    json_path = out / "compressed_context_2m_report.json"
	    md_path = out / "compressed_context_2m_report.md"
	    # Record the output locations before serialising so both files carry them.
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    return report


	def _markdown(report: dict) -> str:
	    """Render the headline figures of a benchmark report as Markdown.

	    Reads ``report["measurements"]`` and ``report["claim_gate"]`` and returns
	    a title, a blank line, and one bullet per figure, ending with a newline.
	    """
	    measurements = report["measurements"]
	    claim_gate = report["claim_gate"]
	    # (label, rendered value) pairs in output order; the two per-token
	    # ratios use fixed precision, everything else its default formatting.
	    rows = (
	        ("Measured tokens", f"{measurements['measured_tokens']}"),
	        ("Archive bytes/token", f"{measurements['archive_bytes_per_token']:.4f}"),
	        ("Index bytes/token", f"{measurements['index_bytes_per_token']:.8f}"),
	        ("KV tokens stored", f"{measurements['kv_tokens_stored']}"),
	        ("Passkey recall passed", f"{measurements['passkey_recall_passed']}"),
	        ("Semantic recall passed", f"{measurements['semantic_recall_passed']}"),
	        ("2M compressed context ready", f"{claim_gate['compressed_2m_context_ready']}"),
	        ("Full KV 2M claim allowed", f"{claim_gate['full_kv_2m_claim_allowed']}"),
	        ("World-longest claim allowed", f"{claim_gate['world_longest_context_claim_allowed']}"),
	    )
	    bullets = "".join(f"- {label}: {value}\n" for label, value in rows)
	    return "# TinyMind 2M Compressed Context Evidence\n\n" + bullets

Xet Storage Details

Size:: 7.3 kB
Xet hash:: ffeac445b9568d5b45822e4503fa3737831efa765b97694db799c9b45697c959

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.