# Hosting-page residue (not part of the module): uploaded by bbkdevops; raw file size 7.3 kB.
"""2M-token compressed exact context evidence for TinyMind.
This module proves the practical path for very large context: exact persisted
tokens in a hash ledger, plus a tiny compression index that stores only anchors,
hashes, and locators in memory. It is not a claim that the transformer KV cache
contains two million tokens.
"""
from __future__ import annotations
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Iterable
from evaluation.evidence_ledger import EvidenceLedgerV2
# Default number of tokens the benchmark ingests; the report's readiness gate
# also requires at least this many measured tokens.
TARGET_CONTEXT_TOKENS = 2_000_000


@dataclass(frozen=True)
class CompressionIndexConfig:
    """Immutable settings for one compressed-context benchmark run."""

    # Number of tokens the run is configured to cover.
    target_tokens: int = TARGET_CONTEXT_TOKENS
    # Tokens per ledger chunk (the benchmark passes the same value to the ledger).
    chunk_tokens: int = 8192
    # A chunk becomes an index anchor when its id is a multiple of this stride.
    anchor_stride_chunks: int = 8
    # Length, in tokens, of each named semantic span the benchmark registers.
    semantic_span_length: int = 16
def _token_stream(token_count: int, passkeys: dict[int, int]) -> Iterable[int]:
for position in range(int(token_count)):
if position in passkeys:
yield int(passkeys[position])
else:
# Deterministic high-entropy-ish stream without storing a list.
yield int(((position * 1_103_515_245 + 12_345) ^ (position >> 7)) % 32_000)
def build_compression_index(manifest: dict, cfg: CompressionIndexConfig) -> dict:
    """Build the sparse index that summarises a ledger manifest.

    Only every ``cfg.anchor_stride_chunks``-th chunk (by ``chunk_id``) is kept
    as an anchor, together with a 16-character prefix of its SHA-256 string.

    Args:
        manifest: Mapping that provides ``chunks`` (each with ``chunk_id``,
            ``start``, ``length`` and ``sha256``), ``total_tokens``,
            ``chunk_tokens`` and ``chunk_count``; ``passkeys`` is optional.
        cfg: Configuration; only ``anchor_stride_chunks`` is read here.

    Returns:
        A JSON-serialisable dict with the anchors, the sorted passkey
        positions and a size estimate for the index.

    Raises:
        KeyError: If a required manifest or chunk key is missing.
    """
    chunks = manifest["chunks"]
    # A stride below 1 would divide by zero (or skip nothing meaningful), so
    # clamp it once instead of on every chunk.
    stride = max(1, cfg.anchor_stride_chunks)
    anchors = [
        {
            "chunk_id": int(chunk["chunk_id"]),
            "start": int(chunk["start"]),
            "length": int(chunk["length"]),
            "sha256_prefix": str(chunk["sha256"])[:16],
        }
        for chunk in chunks
        if int(chunk["chunk_id"]) % stride == 0
    ]
    passkey_positions = sorted(int(pos) for pos in manifest.get("passkeys", {}))
    # Size estimate: the serialised anchors plus a flat 24 bytes per passkey locator.
    anchor_bytes = len(json.dumps(anchors, sort_keys=True).encode("utf-8"))
    estimated_index_bytes = anchor_bytes + len(passkey_positions) * 24
    total_tokens = int(manifest["total_tokens"])
    return {
        "schema_version": "tinymind-2m-compression-index-v1",
        "method": "AxiomDeltaHashIndex",
        "description": (
            "Exact tokens remain in the ledger. The compressed index stores sparse chunk anchors, "
            "hash prefixes, semantic spans, and passkey locators for retrieval/regeneration."
        ),
        "target_tokens": total_tokens,
        "chunk_tokens": int(manifest["chunk_tokens"]),
        "chunk_count": int(manifest["chunk_count"]),
        "anchor_stride_chunks": int(cfg.anchor_stride_chunks),
        "anchor_count": len(anchors),
        "anchors": anchors,
        "passkey_positions": passkey_positions,
        "kv_tokens_stored": 0,
        "estimated_index_bytes": estimated_index_bytes,
        # max(1, ...) avoids a ZeroDivisionError for an empty manifest.
        "index_bytes_per_token": estimated_index_bytes / max(1, total_tokens),
    }
def run_compressed_context_2m_benchmark(
    out_dir: str | Path,
    *,
    token_count: int = TARGET_CONTEXT_TOKENS,
    chunk_tokens: int = 8192,
    anchor_stride_chunks: int = 8,
) -> dict:
    """Ingest a synthetic token stream into a ledger and report recall evidence.

    The run plants four passkeys and three named semantic spans, ingests the
    stream into an ``EvidenceLedgerV2`` under ``out_dir / "ledger"``, builds the
    compression index, recalls every passkey and span, and writes a JSON and a
    Markdown report into ``out_dir``.

    Args:
        out_dir: Directory for the ledger and the two report files; created
            (with parents) if it does not exist.
        token_count: Number of synthetic tokens to ingest.
        chunk_tokens: Chunk size handed to the ledger.
        anchor_stride_chunks: Anchor stride used for the compression index.

    Returns:
        The report dict, including the paths of the files that were written.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    cfg = CompressionIndexConfig(
        target_tokens=token_count,
        chunk_tokens=chunk_tokens,
        anchor_stride_chunks=anchor_stride_chunks,
    )

    # Passkeys sit at the start, one third, two thirds and the final position.
    positions = [0, token_count // 3, (token_count * 2) // 3, token_count - 1]
    passkeys = {pos: 120_000_000 + i for i, pos in enumerate(positions)}
    # Each span is a (start, length) pair.
    semantic_spans = {
        "first": (0, cfg.semantic_span_length),
        "middle": (token_count // 2, cfg.semantic_span_length),
        "last_window": (max(0, token_count - cfg.semantic_span_length), cfg.semantic_span_length),
    }

    ledger = EvidenceLedgerV2(out / "ledger", chunk_tokens=chunk_tokens)
    manifest = ledger.ingest(_token_stream(token_count, passkeys), passkeys=passkeys, semantic_spans=semantic_spans)
    integrity = ledger.verify_integrity()
    index = build_compression_index(manifest, cfg)

    passkey_hits = []
    for position, expected in passkeys.items():
        hit = ledger.recall_position(position)
        hit["expected"] = expected
        hit["matched"] = int(hit["token"]) == int(expected)
        passkey_hits.append(hit)

    semantic_hits = {}
    for name in semantic_spans:
        hit = ledger.recall_semantic(name)
        semantic_hits[name] = {
            "start": hit["start"],
            "length": hit["length"],
            "token_count": len(hit["tokens"]),
            "first_token": hit["tokens"][0] if hit["tokens"] else None,
        }

    archive_bytes = sum(Path(chunk["path"]).stat().st_size for chunk in manifest["chunks"])

    # Compute each verdict once so the measurements and the claim gate cannot
    # drift apart.
    passkey_recall_passed = all(hit["matched"] for hit in passkey_hits)
    semantic_recall_passed = all(
        hit["token_count"] == cfg.semantic_span_length for hit in semantic_hits.values()
    )
    integrity_passed = bool(integrity["passed"])
    # The exact-recall claim must follow the measured evidence rather than be
    # asserted unconditionally.
    exact_recall_verified = passkey_recall_passed and integrity_passed
    if exact_recall_verified:
        reason = "2M exact recall is backed by ledger/index evidence; full transformer KV and external rank claims are not asserted."
    else:
        reason = "Passkey recall or ledger integrity failed; exact recall, full transformer KV and external rank claims are not asserted."

    report = {
        "schema_version": "tinymind-compressed-context-2m-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "config": asdict(cfg),
        "ledger_manifest": str(ledger.manifest_path),
        "semantic_index_path": str(ledger.semantic_path),
        "compression_index": index,
        "integrity": integrity,
        "measurements": {
            "measured_tokens": int(token_count),
            "archive_bytes": int(archive_bytes),
            # max(1, ...) avoids a ZeroDivisionError when token_count is 0.
            "archive_bytes_per_token": archive_bytes / max(1, int(token_count)),
            "index_bytes": int(index["estimated_index_bytes"]),
            "index_bytes_per_token": index["index_bytes_per_token"],
            "kv_tokens_stored": 0,
            "passkey_recall_passed": passkey_recall_passed,
            "semantic_recall_passed": semantic_recall_passed,
        },
        "passkey_hits": passkey_hits,
        "semantic_hits": semantic_hits,
        "claim_gate": {
            "compressed_2m_context_ready": token_count >= TARGET_CONTEXT_TOKENS and exact_recall_verified,
            "full_kv_2m_claim_allowed": False,
            "exact_recall_claim_allowed": exact_recall_verified,
            "world_longest_context_claim_allowed": False,
            "reason": reason,
        },
    }

    json_path = out / "compressed_context_2m_report.json"
    md_path = out / "compressed_context_2m_report.md"
    # The paths are recorded before serialising so the saved JSON contains them.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict) -> str:
m = report["measurements"]
gate = report["claim_gate"]
return "\n".join(
[
"# TinyMind 2M Compressed Context Evidence",
"",
f"- Measured tokens: {m['measured_tokens']}",
f"- Archive bytes/token: {m['archive_bytes_per_token']:.4f}",
f"- Index bytes/token: {m['index_bytes_per_token']:.8f}",
f"- KV tokens stored: {m['kv_tokens_stored']}",
f"- Passkey recall passed: {m['passkey_recall_passed']}",
f"- Semantic recall passed: {m['semantic_recall_passed']}",
f"- 2M compressed context ready: {gate['compressed_2m_context_ready']}",
f"- Full KV 2M claim allowed: {gate['full_kv_2m_claim_allowed']}",
f"- World-longest claim allowed: {gate['world_longest_context_claim_allowed']}",
"",
]
)

# Hosting-page residue (not part of the module): Xet Storage Details
#
# Size: 7.3 kB
# Xet hash: ffeac445b9568d5b45822e4503fa3737831efa765b97694db799c9b45697c959
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads.