Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /compressed_context_2m.py
| """2M-token compressed exact context evidence for TinyMind. | |
| This module proves the practical path for very large context: exact persisted | |
| tokens in a hash ledger, plus a tiny compression index that stores only anchors, | |
| hashes, and locators in memory. It is not a claim that the transformer KV cache | |
| contains two million tokens. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import asdict, dataclass | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| from typing import Iterable | |
| from evaluation.evidence_ledger import EvidenceLedgerV2 | |
TARGET_CONTEXT_TOKENS = 2_000_000


@dataclass
class CompressionIndexConfig:
    """Tunable parameters for the compressed-context benchmark.

    This has to be a dataclass: the benchmark builds it with keyword
    arguments and serialises it with ``dataclasses.asdict``, neither of which
    works on a plain class that only carries annotated class attributes.
    """

    # Number of tokens the run is configured for.
    target_tokens: int = TARGET_CONTEXT_TOKENS
    # Chunk size, in tokens, recorded for the run.
    chunk_tokens: int = 8192
    # Only chunks whose id is a multiple of this stride become index anchors.
    anchor_stride_chunks: int = 8
    # Length, in tokens, of each semantic span registered by the benchmark.
    semantic_span_length: int = 16
| def _token_stream(token_count: int, passkeys: dict[int, int]) -> Iterable[int]: | |
| for position in range(int(token_count)): | |
| if position in passkeys: | |
| yield int(passkeys[position]) | |
| else: | |
| # Deterministic high-entropy-ish stream without storing a list. | |
| yield int(((position * 1_103_515_245 + 12_345) ^ (position >> 7)) % 32_000) | |
def build_compression_index(manifest: dict, cfg: CompressionIndexConfig) -> dict:
    """Summarise a ledger manifest as a sparse, in-memory compression index.

    Args:
        manifest: Mapping that provides ``"chunks"`` (each entry with
            ``"chunk_id"``, ``"start"``, ``"length"`` and ``"sha256"``),
            ``"total_tokens"``, ``"chunk_tokens"``, ``"chunk_count"`` and,
            optionally, ``"passkeys"`` keyed by position.
        cfg: Configuration; only ``anchor_stride_chunks`` is read here.

    Returns:
        A dict holding the selected anchors, the sorted passkey positions and
        a size estimate for the index, in total and per token.
    """
    anchors = []
    for entry in manifest["chunks"]:
        # A stride below 1 is treated as 1, i.e. every chunk is an anchor.
        if int(entry["chunk_id"]) % max(1, cfg.anchor_stride_chunks) != 0:
            continue
        anchors.append(
            {
                "chunk_id": int(entry["chunk_id"]),
                "start": int(entry["start"]),
                "length": int(entry["length"]),
                "sha256_prefix": str(entry["sha256"])[:16],
            }
        )

    passkey_positions = sorted(map(int, manifest.get("passkeys", {})))

    # Size estimate: the anchors as serialised JSON plus a flat 24 bytes per
    # passkey locator.
    anchor_bytes = len(json.dumps(anchors, sort_keys=True).encode("utf-8"))
    locator_bytes = len(passkey_positions) * 24
    estimated_index_bytes = anchor_bytes + locator_bytes

    total_tokens = int(manifest["total_tokens"])

    index = {
        "schema_version": "tinymind-2m-compression-index-v1",
        "method": "AxiomDeltaHashIndex",
        "description": (
            "Exact tokens remain in the ledger. The compressed index stores sparse chunk anchors, "
            "hash prefixes, semantic spans, and passkey locators for retrieval/regeneration."
        ),
        "target_tokens": total_tokens,
        "chunk_tokens": int(manifest["chunk_tokens"]),
        "chunk_count": int(manifest["chunk_count"]),
        "anchor_stride_chunks": int(cfg.anchor_stride_chunks),
        "anchor_count": len(anchors),
        "anchors": anchors,
        "passkey_positions": passkey_positions,
        "kv_tokens_stored": 0,
        "estimated_index_bytes": estimated_index_bytes,
        # Guard the division so an empty manifest does not divide by zero.
        "index_bytes_per_token": estimated_index_bytes / max(1, total_tokens),
    }
    return index
def run_compressed_context_2m_benchmark(
    out_dir: str | Path,
    *,
    token_count: int = TARGET_CONTEXT_TOKENS,
    chunk_tokens: int = 8192,
    anchor_stride_chunks: int = 8,
) -> dict:
    """Ingest a synthetic token stream into a ledger and report recall evidence.

    The run plants four passkeys (first position, one third, two thirds, last
    position), registers three semantic spans, ingests the stream into an
    ``EvidenceLedgerV2`` under ``out_dir / "ledger"``, builds the compression
    index, recalls every passkey and span, and writes the report next to the
    ledger as ``compressed_context_2m_report.json`` and ``.md``.

    Args:
        out_dir: Directory for the ledger and the report files; created if
            it does not exist.
        token_count: Number of synthetic tokens to ingest. Must be >= 1.
        chunk_tokens: Chunk size passed to the ledger.
        anchor_stride_chunks: Stride used when selecting index anchors.

    Returns:
        The report dict that was written to disk, including the paths of
        both written files.

    Raises:
        ValueError: If ``token_count`` is smaller than 1.
    """
    if token_count < 1:
        # With no tokens the "last position" passkey would sit at -1, which
        # the stream never emits, so the run could not produce valid evidence.
        raise ValueError(f"token_count must be at least 1, got {token_count!r}")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    cfg = CompressionIndexConfig(
        target_tokens=token_count,
        chunk_tokens=chunk_tokens,
        anchor_stride_chunks=anchor_stride_chunks,
    )

    # Passkey values start at 120_000_000, far outside the [0, 32_000) range
    # of the filler tokens, so a recalled passkey cannot be a filler value.
    positions = [0, token_count // 3, (token_count * 2) // 3, token_count - 1]
    passkeys = {pos: 120_000_000 + i for i, pos in enumerate(positions)}
    span = cfg.semantic_span_length
    semantic_spans = {
        "first": (0, span),
        "middle": (token_count // 2, span),
        "last_window": (max(0, token_count - span), span),
    }

    ledger = EvidenceLedgerV2(out / "ledger", chunk_tokens=chunk_tokens)
    manifest = ledger.ingest(
        _token_stream(token_count, passkeys),
        passkeys=passkeys,
        semantic_spans=semantic_spans,
    )
    integrity = ledger.verify_integrity()
    index = build_compression_index(manifest, cfg)

    passkey_hits = []
    for position, expected in passkeys.items():
        hit = ledger.recall_position(position)
        hit["expected"] = expected
        hit["matched"] = int(hit["token"]) == int(expected)
        passkey_hits.append(hit)

    semantic_hits = {}
    for name in semantic_spans:
        hit = ledger.recall_semantic(name)
        tokens = hit["tokens"]
        semantic_hits[name] = {
            "start": hit["start"],
            "length": hit["length"],
            "token_count": len(tokens),
            "first_token": tokens[0] if tokens else None,
        }

    archive_bytes = sum(Path(chunk["path"]).stat().st_size for chunk in manifest["chunks"])

    passkey_recall_passed = all(hit["matched"] for hit in passkey_hits)
    semantic_recall_passed = all(
        hit["token_count"] == cfg.semantic_span_length for hit in semantic_hits.values()
    )
    # The evidence every recall claim rests on. The exact-recall gate used to
    # be a literal True, so a failed run still advertised the claim.
    recall_evidence_ok = passkey_recall_passed and bool(integrity["passed"])
    if recall_evidence_ok:
        reason = "2M exact recall is backed by ledger/index evidence; full transformer KV and external rank claims are not asserted."
    else:
        reason = "Passkey recall or ledger integrity verification failed; exact recall claims are withheld."

    report = {
        "schema_version": "tinymind-compressed-context-2m-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "config": asdict(cfg),
        "ledger_manifest": str(ledger.manifest_path),
        "semantic_index_path": str(ledger.semantic_path),
        "compression_index": index,
        "integrity": integrity,
        "measurements": {
            "measured_tokens": int(token_count),
            "archive_bytes": int(archive_bytes),
            "archive_bytes_per_token": archive_bytes / max(1, int(token_count)),
            "index_bytes": int(index["estimated_index_bytes"]),
            "index_bytes_per_token": index["index_bytes_per_token"],
            "kv_tokens_stored": 0,
            "passkey_recall_passed": passkey_recall_passed,
            "semantic_recall_passed": semantic_recall_passed,
        },
        "passkey_hits": passkey_hits,
        "semantic_hits": semantic_hits,
        "claim_gate": {
            # "Ready" additionally requires the run to cover the full target.
            "compressed_2m_context_ready": token_count >= TARGET_CONTEXT_TOKENS and recall_evidence_ok,
            "full_kv_2m_claim_allowed": False,
            "exact_recall_claim_allowed": recall_evidence_ok,
            "world_longest_context_claim_allowed": False,
            "reason": reason,
        },
    }

    json_path = out / "compressed_context_2m_report.json"
    md_path = out / "compressed_context_2m_report.md"
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
| def _markdown(report: dict) -> str: | |
| m = report["measurements"] | |
| gate = report["claim_gate"] | |
| return "\n".join( | |
| [ | |
| "# TinyMind 2M Compressed Context Evidence", | |
| "", | |
| f"- Measured tokens: {m['measured_tokens']}", | |
| f"- Archive bytes/token: {m['archive_bytes_per_token']:.4f}", | |
| f"- Index bytes/token: {m['index_bytes_per_token']:.8f}", | |
| f"- KV tokens stored: {m['kv_tokens_stored']}", | |
| f"- Passkey recall passed: {m['passkey_recall_passed']}", | |
| f"- Semantic recall passed: {m['semantic_recall_passed']}", | |
| f"- 2M compressed context ready: {gate['compressed_2m_context_ready']}", | |
| f"- Full KV 2M claim allowed: {gate['full_kv_2m_claim_allowed']}", | |
| f"- World-longest claim allowed: {gate['world_longest_context_claim_allowed']}", | |
| "", | |
| ] | |
| ) | |
Xet Storage Details
- Size:
- 7.3 kB
- Xet hash:
- ffeac445b9568d5b45822e4503fa3737831efa765b97694db799c9b45697c959
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.