AbstractPhil
/

geolip-aleph-void

+# lexical_atlas.py
+"""
+Lexical Atlas — the full wordnet-lexical-topology vocabulary on the sphere
+===========================================================================
+Extracts the ENTIRE AbstractPhil/wordnet-lexical-topology setup (~12.8M
+n-grams across {nltk, hf, unicode} x {char, word} x {1..5}gram configs) into
+spherical coordinates, correctly spaced — where "correct" is determined by
+capacity mathematics, not hope.
+THE CAPACITY LAW (computed exactly, 2026-06-09):
+    12.8M uniformly spaced points on S^(D-1), median nearest-neighbor angle:
+        D=4 : 0.363 deg  -> 0.06 logits of address contrast at tau=0.1
+                            (neighbors indistinguishable through K=64; fp16
+                            cannot resolve the cosines, fp32 marginal)
+        D=32: 39.1 deg   |  D=48: 47.6 deg  -> 7-8 logits, comfortable
+    The CM-band result (band-valid D=32-112, sweet spot 32-56) independently
+    prescribes the same range. THEREFORE the atlas is TWO-TIER:
+    TIER 1 (base)  : deterministic low-discrepancy placement at band-valid D
+                     (default 48) — scrambled-Sobol -> Gaussian -> normalize.
+                     Uniform by construction, reproducible by seed, unique
+                     per n-gram. This is "spaced on the sphere correctly."
+    TIER 2 (view)  : the LEARNED D_addr=4 address-space view extracted from a
+                     trained AlephLM checkpoint — per n-gram: bytes -> pad ->
+                     trigrams -> kappa rows (W_kappa o byte_emb) -> mean ->
+                     normalize. This is where the model actually PLACED the
+                     vocabulary; crowded by necessity (see law), meaningful
+                     as geometry-of-content, not as unique identity.
+Honesty on the learned view: mean composition is order-insensitive, so
+anagrammatic n-grams (same trigram multiset) COLLIDE; collisions are counted
+and reported per config. The deterministic tier never collides.
+Per-config outputs:
+    atlas/{config}.parquet   columns: ngram, rank, frequency, n_tri,
+                             vec_base (D_base floats), vec_view (4 floats)
+    atlas/{config}.stats.json   NN-angle distribution (sampled), statute of
+                             both tiers (4k subsample), collision count
+Usage:
+    python lexical_atlas.py --checkpoint aleph_lm_hybrid_corpus.pt \\
+        --configs char_eng_unigram char_eng_2gram char_eng_3gram \\
+                  char_eng_4gram char_eng_5gram --d-base 48
+    # --configs all  -> every source-prefixed config (~12.8M rows total; legacy configs excluded)
+Depends: aleph_lm.py (+ its deps), pandas, pyarrow, huggingface_hub.
+Author: AbstractPhil + Mirel    Date: 2026-06-09    License: MIT
+"""
+from __future__ import annotations
+import json
+import math
+import os
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
# Hugging Face dataset repo; each config is fetched from it as one parquet file.
DATASET = "AbstractPhil/wordnet-lexical-topology"
# Reserved pad symbol (documented, learned slot) used to complete a trigram.
PAD_BYTE = 0x00
# Granularity ladder shared by every source.
_GRAMS = ("unigram", "2gram", "3gram", "4gram", "5gram")
# Source-prefixed configs: {nltk, hf} x {char, word} x ladder, then unicode.
SOURCE_CONFIGS = [
    f"{src}_{kind}_eng_{gram}"
    for src in ("nltk", "hf")
    for kind in ("char", "word")
    for gram in _GRAMS
] + [f"unicode_global_{gram}" for gram in _GRAMS]
# Unprefixed configs: {char, word} x ladder.
LEGACY_CONFIGS = [
    f"{kind}_eng_{gram}" for kind in ("char", "word") for gram in _GRAMS
]
# The legacy configs are pre-merged ANCESTORS of the prefixed ones (verified:
# char_eng_3gram is a superset of nltk_char_eng_3gram, freq corr 0.914), so
# 'all' leaves them out to avoid double counting. They can still be requested
# explicitly by name.
ALL_CONFIGS = SOURCE_CONFIGS
def source_of(config: str) -> str:
    """Return the source family of a config name.

    The family is the first of ``nltk``, ``hf``, ``unicode`` that *config*
    starts with; any other name is reported as ``legacy``.
    """
    families = ("nltk", "hf", "unicode")
    return next((fam for fam in families if config.startswith(fam)), "legacy")
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Config
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@dataclass
class AtlasConfig:
    """Run-time settings shared by atlas extraction, canonization and banks."""

    # AlephLM checkpoint path; None builds the base tier only.
    checkpoint: Optional[str] = None
    # Dataset configs to process; defaults to the unprefixed char_eng ladder.
    configs: List[str] = field(default_factory=lambda: [
        f"char_eng_{gram}"
        for gram in ("unigram", "2gram", "3gram", "4gram", "5gram")])
    # Tier-1 dimensionality: band-valid (CM sweet spot 32-56).
    d_base: int = 48
    # Seed of the Tier-1 Sobol stream (determinism of Tier 1).
    base_seed: int = 1234
    # Directory that receives the parquet / stats / bank files.
    out_dir: str = "atlas"
    # Number of n-grams composed per Tier-2 batch.
    batch: int = 65536
    # Trigram slots per n-gram; n-grams longer than 48 bytes are truncated.
    max_tri: int = 16
    # Subsample size used by the spacing statistics.
    stats_sample: int = 4000
    # Evaluated once, when the class is defined.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━���━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Tier 1 — deterministic band-valid base (correct spacing by construction)
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class SobolSphere:
    """Scrambled-Sobol point stream mapped onto the unit sphere S^(D-1).

    Every draw goes uniform -> inverse normal CDF -> L2 normalization. The
    engine is seeded, and successive ``take`` calls continue one stream, so
    the caller decides which item receives which point purely by draw order.
    """

    def __init__(self, D: int, seed: int):
        self.D = D
        self.seed = seed
        self.eng = torch.quasirandom.SobolEngine(D, scramble=True, seed=seed)
        self._cursor = 0          # number of points handed out so far

    def take(self, n: int) -> Tensor:
        """Return the next *n* points of the stream as (n, D) unit rows."""
        uniform = self.eng.draw(n)
        # Stay strictly inside (0, 1) so erfinv never returns +/-inf.
        uniform = uniform.clamp(1e-6, 1 - 1e-6)
        gauss = torch.erfinv(2 * uniform - 1) * math.sqrt(2.0)
        self._cursor += n
        return F.normalize(gauss, dim=-1)
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Tier 2 — learned address-space view (the model's own placement)
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
class LearnedView:
    """Tier-2 composer: turns padded trigram bytes into address-space rows
    using the byte embeddings and W_kappa of a trained AlephLM checkpoint."""

    def __init__(self, checkpoint: str, device: str):
        # Local import: aleph_lm is only needed when a checkpoint is supplied.
        from aleph_lm import AlephLM, AlephLMConfig
        # NOTE(review): weights_only=False unpickles arbitrary objects, so
        # only load checkpoints that come from a trusted source.
        payload = torch.load(checkpoint, map_location=device, weights_only=False)
        # Drop config keys the current AlephLMConfig no longer declares.
        known = AlephLMConfig.__dataclass_fields__
        kwargs = {key: val for key, val in payload["config"].items()
                  if key in known}
        model_cfg = AlephLMConfig(**kwargs)
        model = AlephLM(model_cfg, bank=payload.get("bank", None)).to(device)
        model.load_state_dict(payload["model_state_dict"])
        model.eval()
        self.model = model
        self.cfg = model_cfg
        self.device = device

    @torch.no_grad()
    def compose(self, tri: Tensor, n_tri: Tensor) -> Tensor:
        """Compose one unit address row per n-gram.

        tri   : (B, T, 3) padded trigram bytes.
        n_tri : (B,) number of valid trigrams per row.

        Each trigram is embedded (sum of the three per-position byte
        embeddings), projected by W_kappa and normalized; the rows of the
        valid prefix are averaged and normalized again. Averaging ignores
        order, so n-grams with the same trigram multiset get the same row.
        The result is returned on the CPU with shape (B, D_addr).
        """
        model = self.model
        dev = self.device
        tri = tri.to(dev)
        emb = sum(table(tri[..., pos])
                  for pos, table in enumerate(model.byte_emb))       # (B,T,d)
        unit_rows = F.normalize(model.W_kappa(emb), dim=-1)          # (B,T,Da)
        # 1.0 for slots inside the valid prefix, 0.0 for padding slots.
        slots = torch.arange(tri.shape[1], device=dev)[None, :]
        valid = (slots < n_tri.to(dev)[:, None]).float().unsqueeze(-1)
        summed = (unit_rows * valid).sum(1)
        denom = valid.sum(1).clamp_min(1e-9)
        return F.normalize(summed / denom, dim=-1).cpu()
def ngrams_to_trigrams(ngrams: List[str], max_tri: int
                       ) -> Tuple[Tensor, Tensor, np.ndarray]:
    """Frame each n-gram as padded byte trigrams and hash its trigram multiset.

    Every n-gram is UTF-8 encoded (unencodable characters are dropped),
    truncated to ``3 * max_tri`` bytes, padded with PAD_BYTE to a multiple of
    three and split into consecutive 3-byte rows.

    Args:
        ngrams: strings to frame; each item is passed through ``str()``.
        max_tri: number of trigram slots per n-gram.

    Returns:
        ``(tri, counts, mhash)`` where
        * ``tri`` is a (B, max_tri, 3) int64 tensor, zero-filled past the
          last trigram of each row;
        * ``counts`` is a (B,) int64 tensor of trigram counts, floored at 1
          so that an empty n-gram still reports one slot;
        * ``mhash`` is a (B,) uint64 array: an order-free hash of the trigram
          multiset, used upstream to count anagram collisions.
    """
    # The hash is a 64-bit multiply/xor chain. It is evaluated with Python
    # ints masked to 64 bits: doing it with np.uint64 scalars produces the
    # same values but emits an overflow RuntimeWarning on nearly every
    # multi-trigram n-gram (and raises under warnings-as-errors).
    mask64 = (1 << 64) - 1
    prime = 1099511628211
    offset = 0x9E3779B9

    B = len(ngrams)
    out = np.zeros((B, max_tri, 3), dtype=np.int64)
    counts = np.zeros(B, dtype=np.int64)
    mhash = np.zeros(B, dtype=np.uint64)
    for i, s in enumerate(ngrams):
        b = str(s).encode("utf-8", errors="ignore")[: 3 * max_tri]
        rem = len(b) % 3
        if rem:
            b = b + bytes([PAD_BYTE]) * (3 - rem)
        t = np.frombuffer(b, dtype=np.uint8).reshape(-1, 3).astype(np.int64)
        n = len(t)
        out[i, :n] = t
        counts[i] = max(n, 1)
        # Pack each trigram into one 24-bit id; sorting the ids makes the
        # chained hash independent of trigram order.
        ids = sorted((t[:, 0] * 65536 + t[:, 1] * 256 + t[:, 2]).tolist())
        h = 0
        for v in ids:
            h = ((h * prime) & mask64) ^ (v + offset)
        mhash[i] = np.uint64(h ^ n)
    return torch.from_numpy(out), torch.from_numpy(counts), mhash
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Spacing battery
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def spacing_stats(vecs: Tensor, sample: int, seed: int = 0) -> Dict:
    """Nearest-neighbour angle statistics and statute on a random subsample.

    Args:
        vecs: (N, D) vectors; the sampled rows are re-normalized in float32.
        sample: maximum number of rows to draw (without replacement).
        seed: seed of the subsampling permutation.

    Returns:
        Dict with ``nn_deg_median``, ``nn_deg_p05``, ``nn_deg_p95`` (degrees,
        measured inside the subsample only) and ``statute``.
    """
    rng = torch.Generator().manual_seed(seed)
    keep = min(sample, len(vecs))
    pick = torch.randperm(len(vecs), generator=rng)[:keep]
    X = F.normalize(vecs[pick].float(), dim=-1)
    sims = (X @ X.t()).clamp(-1, 1)
    sims.fill_diagonal_(-1)                  # a point is not its own neighbour
    nearest = sims.max(dim=-1).values
    nn_deg = torch.acos(nearest) * 180 / math.pi
    # NOTE(review): `statute` is neither defined nor imported in the visible
    # part of this file — confirm where it is supposed to come from.
    st = statute(X)
    report = {}
    report["nn_deg_median"] = nn_deg.median().item()
    report["nn_deg_p05"] = nn_deg.quantile(0.05).item()
    report["nn_deg_p95"] = nn_deg.quantile(0.95).item()
    report["statute"] = st
    return report
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Extraction
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
def extract_config(name: str, cfg: AtlasConfig, sobol: SobolSphere,
                   view: Optional[LearnedView]) -> Dict:
    """Place one dataset config on the sphere and write its atlas files.

    Downloads ``data/{name}-00000-of-00001.parquet`` from DATASET, orders the
    rows by ``rank``, gives each row the next point of *sobol* (Tier 1) and,
    when *view* is supplied, its learned address row (Tier 2). Writes
    ``{name}.parquet`` and ``{name}.stats.json`` under ``cfg.out_dir``.

    Args:
        name: dataset config name.
        cfg: run settings (batch, max_tri, d_base, out_dir, stats_sample).
        sobol: shared Tier-1 stream; it is advanced by the number of rows.
        view: Tier-2 composer, or None to emit the base tier only.

    Returns:
        The stats dict that was written to the JSON file.

    Raises:
        ValueError: if a vector block does not have the expected width.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from huggingface_hub import hf_hub_download

    def _fixed_list(mat: Tensor, width: int):
        # Build the fixed-size-list column from the tensor's flat float32
        # buffer. Going through .tolist() would create one Python float per
        # coordinate, which is prohibitive at millions of rows.
        if mat.dim() != 2 or mat.shape[1] != width:
            raise ValueError(
                f"expected an (N, {width}) tensor, got {tuple(mat.shape)}")
        flat = np.ascontiguousarray(mat.numpy(), dtype=np.float32).reshape(-1)
        return pa.FixedSizeListArray.from_arrays(
            pa.array(flat, type=pa.float32()), width)

    path = hf_hub_download(DATASET, f"data/{name}-00000-of-00001.parquet",
                           repo_type="dataset")
    t = pq.read_table(path, columns=["ngram", "rank", "frequency"]).to_pandas()
    t = t.sort_values("rank").reset_index(drop=True)
    N = len(t)
    print(f"[{name}] {N:,} n-grams")
    base = sobol.take(N)                                     # (N, D_base)

    vview, n_tri, n_coll = None, None, 0
    if view is not None:
        views, counts, hashes = [], [], []
        for s0 in range(0, N, cfg.batch):
            chunk = t["ngram"].iloc[s0: s0 + cfg.batch].tolist()
            tri, cnt, mh = ngrams_to_trigrams(chunk, cfg.max_tri)
            views.append(view.compose(tri, cnt))
            counts.append(cnt)
            hashes.append(mh)
        vview = torch.cat(views)
        n_tri = torch.cat(counts)
        # Rows that share a trigram-multiset hash collide in the view.
        n_coll = int(N - len(np.unique(np.concatenate(hashes))))

    os.makedirs(cfg.out_dir, exist_ok=True)
    cols = {"ngram": pa.array(t["ngram"].astype(str)),
            "rank": pa.array(t["rank"].astype("int64")),
            "frequency": pa.array(t["frequency"].astype("float64")),
            "vec_base": _fixed_list(base, cfg.d_base)}
    if vview is not None:
        cols["n_tri"] = pa.array(n_tri.numpy().astype("int8"))
        cols["vec_view"] = _fixed_list(vview, vview.shape[1])
    out_path = os.path.join(cfg.out_dir, f"{name}.parquet")
    pq.write_table(pa.table(cols), out_path)

    stats = {"config": name, "n": N, "d_base": cfg.d_base,
             "anagram_collisions_view": n_coll,
             "base": spacing_stats(base, cfg.stats_sample)}
    if vview is not None:
        stats["view"] = spacing_stats(vview, cfg.stats_sample)
    with open(os.path.join(cfg.out_dir, f"{name}.stats.json"), "w") as f:
        json.dump(stats, f, indent=2, default=str)
    print(f"  base NN {stats['base']['nn_deg_median']:.2f} deg "
          f"(statute {stats['base']['statute']['statute']})"
          + (f"   view NN {stats['view']['nn_deg_median']:.3f} deg "
             f"(statute {stats['view']['statute']['statute']}, "
             f"collisions {n_coll})" if vview is not None else "")
          + f"   -> {out_path}")
    return stats
def build_atlas(cfg: AtlasConfig) -> List[Dict]:
    """Extract every requested config and return the per-config stats.

    ``cfg.configs == ["all"]`` selects ALL_CONFIGS; any other value is used
    as the list of config names. The learned view is loaded only when
    ``cfg.checkpoint`` is set.
    """
    if cfg.configs == ["all"]:
        names = ALL_CONFIGS
    else:
        names = cfg.configs
    # ONE stream shared by every config: the running draw index is the global
    # index, so a point is never reused across configs.
    sobol = SobolSphere(cfg.d_base, cfg.base_seed)
    view = None
    if cfg.checkpoint:
        view = LearnedView(cfg.checkpoint, cfg.device)
    all_stats = [extract_config(name, cfg, sobol, view) for name in names]
    total = sum(entry["n"] for entry in all_stats)
    tier2_note = " + learned D=4 view (Tier 2)" if view else ""
    print(f"\n[atlas] {total:,} n-grams placed at D={cfg.d_base} "
          f"(Tier 1, deterministic, collision-free)"
          + tier2_note)
    return all_stats
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Canon — weighted dedupe across sources: ONE STRING, ONE POINT
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Cross-config duplicates of the same n-gram must not receive different
+# Tier-1 placements. Canonization: per-config frequencies are normalized
+# (sum to 1 within config — scale-free across sources), scaled by a
+# per-source weight (HF elevated: frequency-weighted definitions with
+# cardinality), summed per unique string, re-ranked, and placed once.
# Per-source lambdas applied to the within-config normalized frequencies.
DEFAULT_SOURCE_WEIGHTS = {
    "hf": 5.0,        # elevated relative to the other sources
    "nltk": 1.0,
    "unicode": 1.0,
    "legacy": 0.0,    # weight 0: such configs are skipped
}
def canonize(cfg: AtlasConfig,
             source_weights: Optional[Dict[str, float]] = None,
             configs: Optional[List[str]] = None) -> Dict:
    """Build the canonical deduplicated atlas directly from the dataset.

    Per-config frequencies are normalized to sum to 1, scaled by the weight
    of the config's source, summed per unique string, ranked by descending
    weight and placed once on a fresh Sobol stream. Writes ``canon.parquet``
    and ``canon.stats.json`` under ``cfg.out_dir``.

    Args:
        cfg: run settings (d_base, base_seed, checkpoint, batch, ...).
        source_weights: overrides merged on top of DEFAULT_SOURCE_WEIGHTS.
        configs: config names to merge; defaults to SOURCE_CONFIGS.

    Returns:
        The stats dict that was written to the JSON file.

    Raises:
        ValueError: if every requested config has a weight <= 0, or a vector
            block does not have the expected width.
    """
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq
    from huggingface_hub import hf_hub_download

    def _fixed_list(mat: Tensor, width: int):
        # Build the fixed-size-list column from the tensor's flat float32
        # buffer. Going through .tolist() would create one Python float per
        # coordinate, which is prohibitive at millions of rows.
        if mat.dim() != 2 or mat.shape[1] != width:
            raise ValueError(
                f"expected an (N, {width}) tensor, got {tuple(mat.shape)}")
        flat = np.ascontiguousarray(mat.numpy(), dtype=np.float32).reshape(-1)
        return pa.FixedSizeListArray.from_arrays(
            pa.array(flat, type=pa.float32()), width)

    W = dict(DEFAULT_SOURCE_WEIGHTS)
    if source_weights:
        W.update(source_weights)
    names = configs or SOURCE_CONFIGS

    frames = []
    for name in names:
        src = source_of(name)
        lam = W.get(src, 0.0)
        if lam <= 0:
            print(f"[canon] {name}: weight 0 — skipped")
            continue
        p = hf_hub_download(DATASET, f"data/{name}-00000-of-00001.parquet",
                            repo_type="dataset")
        t = pq.read_table(p, columns=["ngram", "frequency"]).to_pandas()
        t["ngram"] = t["ngram"].astype(str)
        # Normalize within the config so sources of different size compare.
        t["w"] = lam * t["frequency"] / max(t["frequency"].sum(), 1e-30)
        t["src"] = src
        frames.append(t[["ngram", "w", "src"]])
        print(f"[canon] {name}: {len(t):,} rows  (lambda={lam})")
    if not frames:
        raise ValueError(
            "canonize: every requested config has weight <= 0; nothing to merge")

    allrows = pd.concat(frames, ignore_index=True)
    print(f"[canon] total rows {len(allrows):,}")
    agg = allrows.groupby("ngram", sort=False).agg(
        weight=("w", "sum"),
        n_sources=("src", "nunique"),
        sources=("src", lambda s: "+".join(sorted(set(s)))))
    # A stable sort keeps the ranking of equal-weight strings (and therefore
    # their Tier-1 points) reproducible; the default sort gives no guarantee
    # about how ties are ordered.
    agg = agg.sort_values("weight", ascending=False,
                          kind="mergesort").reset_index()
    N = len(agg)
    dup = len(allrows) - N
    print(f"[canon] unique n-grams {N:,}  (merged {dup:,} duplicate rows)")

    # Tier 1: one fresh stream over the canonical ranking — one string, one point
    sobol = SobolSphere(cfg.d_base, cfg.base_seed)
    base = sobol.take(N)

    # Tier 2: learned view regenerated per unique string (pure function)
    view = LearnedView(cfg.checkpoint, cfg.device) if cfg.checkpoint else None
    vview, n_tri_all, n_coll = None, None, 0
    if view is not None:
        views, counts, hashes = [], [], []
        for s0 in range(0, N, cfg.batch):
            chunk = agg["ngram"].iloc[s0: s0 + cfg.batch].tolist()
            tri, n_tri, mh = ngrams_to_trigrams(chunk, cfg.max_tri)
            views.append(view.compose(tri, n_tri))
            counts.append(n_tri)
            hashes.append(mh)
        vview = torch.cat(views)
        n_tri_all = torch.cat(counts)
        n_coll = int(N - len(np.unique(np.concatenate(hashes))))

    os.makedirs(cfg.out_dir, exist_ok=True)
    cols = {"ngram": pa.array(agg["ngram"]),
            "weight": pa.array(agg["weight"].astype("float64")),
            "rank": pa.array(np.arange(1, N + 1, dtype=np.int64)),
            "n_sources": pa.array(agg["n_sources"].astype("int8")),
            "sources": pa.array(agg["sources"]),
            "vec_base": _fixed_list(base, cfg.d_base)}
    if vview is not None:
        cols["n_tri"] = pa.array(n_tri_all.numpy().astype("int8"))
        cols["vec_view"] = _fixed_list(vview, vview.shape[1])
    out_path = os.path.join(cfg.out_dir, "canon.parquet")
    pq.write_table(pa.table(cols), out_path)

    stats = {"unique": N, "merged_duplicates": dup,
             "source_weights": W, "configs": names,
             "anagram_collisions_view": n_coll,
             "base": spacing_stats(base, cfg.stats_sample)}
    if vview is not None:
        stats["view"] = spacing_stats(vview, cfg.stats_sample)
    with open(os.path.join(cfg.out_dir, "canon.stats.json"), "w") as f:
        json.dump(stats, f, indent=2, default=str)
    print(f"[canon] -> {out_path}  "
          f"(base NN {stats['base']['nn_deg_median']:.2f} deg"
          + (f", view collisions {n_coll}" if vview is not None else "") + ")")
    return stats
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Stratified bank — round-robin across the granularity ladder
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Breadth-first sampling: rank-1 of every config, then rank-2, ... with
+# weighted dedupe along the way, until `target` unique n-grams. Yields a
+# compact multi-granularity candidate vocabulary stratified across
+# {source} x {char, word} x {1..5}gram. Two outputs:
+#   bank_{target}.parquet      the full multi-granularity bank
+#   bank_{target}_tri.pt       the 3-byte-exact subset as an (M, 3) tensor —
+#                              a DROP-IN AlephLM trigram bank (only exact
+#                              3-byte strings can match raw next-trigram
+#                              targets; variable-length candidates await the
+#                              span-prediction head — v2, noted in log)
def stratified_bank(cfg: AtlasConfig, target: int = 4096,
                    source_weights: Optional[Dict[str, float]] = None,
                    configs: Optional[List[str]] = None) -> Dict:
    """Build a round-robin, multi-granularity bank of unique n-grams.

    Walks the configs breadth-first (rank-ordered row 0 of every config, then
    row 1, ...), merging repeated strings by summing their weights, until
    *target* unique n-grams are collected or every config is exhausted.
    Writes ``bank_{target}.parquet`` (the full bank) and
    ``bank_{target}_tri.pt`` (the subset whose UTF-8 encoding is exactly
    three bytes, as an (M, 3) long tensor) under ``cfg.out_dir``.

    Args:
        cfg: run settings; only ``out_dir`` is read here.
        target: number of unique n-grams to collect.
        source_weights: overrides merged on top of DEFAULT_SOURCE_WEIGHTS.
        configs: config names to sample; defaults to SOURCE_CONFIGS. Configs
            whose source weight is not positive are left out.

    Returns:
        Dict with ``n`` (bank size), ``depth`` (round-robin depth reached),
        ``n_tri`` (size of the 3-byte subset) and ``paths``.
    """
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq
    from huggingface_hub import hf_hub_download

    W = dict(DEFAULT_SOURCE_WEIGHTS)
    if source_weights:
        W.update(source_weights)
    names = [n for n in (configs or SOURCE_CONFIGS)
             if W.get(source_of(n), 0.0) > 0]

    tables = []
    for name in names:
        p = hf_hub_download(DATASET, f"data/{name}-00000-of-00001.parquet",
                            repo_type="dataset")
        t = pq.read_table(p, columns=["ngram", "rank", "frequency"]).to_pandas()
        t["ngram"] = t["ngram"].astype(str)
        lam = W[source_of(name)]
        t["w"] = lam * t["frequency"] / max(t["frequency"].sum(), 1e-30)
        t["config"] = name
        tables.append(t.sort_values("rank").reset_index(drop=True))

    chosen: Dict[str, Dict] = {}
    depth = 0
    while len(chosen) < target:
        progressed = False
        for t in tables:
            if depth >= len(t):
                continue                      # this config is exhausted
            progressed = True
            row = t.iloc[depth]
            rec = chosen.get(row.ngram)
            if rec is None:
                chosen[row.ngram] = {"weight": row.w, "configs": {row.config},
                                     "first_depth": depth}
            else:
                rec["weight"] += row.w
                rec["configs"].add(row.config)
            if len(chosen) >= target:
                break
        depth += 1
        if not progressed:
            break                             # every config is exhausted

    columns = ["ngram", "weight", "n_configs", "configs", "first_depth",
               "n_bytes"]
    rows = [{"ngram": k, "weight": v["weight"],
             "n_configs": len(v["configs"]),
             "configs": "+".join(sorted(v["configs"])),
             "first_depth": v["first_depth"],
             "n_bytes": len(k.encode("utf-8", errors="ignore"))}
            for k, v in chosen.items()]
    # Explicit columns keep the sort below valid when nothing was selected.
    bank = pd.DataFrame(rows, columns=columns).sort_values(
        ["first_depth", "weight"], ascending=[True, False]).reset_index(drop=True)

    os.makedirs(cfg.out_dir, exist_ok=True)
    out_pq = os.path.join(cfg.out_dir, f"bank_{target}.parquet")
    pq.write_table(pa.Table.from_pandas(bank, preserve_index=False), out_pq)

    # Encode once, with the same error policy as n_bytes, so the length test
    # and the stored bytes always refer to the same byte string.
    tri_rows = []
    for k in bank["ngram"]:
        raw = k.encode("utf-8", errors="ignore")
        if len(raw) == 3:
            tri_rows.append(list(raw))
    if tri_rows:
        tri = torch.tensor(tri_rows, dtype=torch.long)
    else:
        tri = torch.empty(0, 3, dtype=torch.long)
    out_pt = os.path.join(cfg.out_dir, f"bank_{target}_tri.pt")
    torch.save({"bank": tri, "source": "stratified_atlas",
                "target": target, "weights": W, "configs": names}, out_pt)

    print(f"[bank] {len(bank):,} unique n-grams at round-robin depth {depth}"
          f"  (3-byte-exact: {len(tri):,} -> {out_pt})")
    print(f"[bank] multi-config members: {(bank.n_configs > 1).sum():,}"
          f"   byte-length histogram: "
          f"{bank.n_bytes.value_counts().sort_index().to_dict()}")
    print(f"[bank] -> {out_pq}")
    return {"n": len(bank), "depth": depth, "n_tri": len(tri),
            "paths": [out_pq, out_pt]}
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Activation
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
if __name__ == "__main__":
    # Command-line entry point. Mode selection: --bank N builds a stratified
    # bank, otherwise --canon builds the deduplicated atlas, otherwise the
    # per-config atlas is built.
    import argparse
    ap = argparse.ArgumentParser(description="Full lexical-topology atlas")
    ap.add_argument("--checkpoint", default=None)
    ap.add_argument("--configs", nargs="+", default=["all"])
    ap.add_argument("--d-base", type=int, default=48)
    ap.add_argument("--out", default="atlas")
    ap.add_argument("--device",
                    default="cuda" if torch.cuda.is_available() else "cpu")
    ap.add_argument("--canon", action="store_true",
                    help="weighted dedupe across sources: one string, one point")
    ap.add_argument("--weights", default="hf=5,nltk=1,unicode=1,legacy=0",
                    help="per-source lambdas, e.g. hf=5,nltk=1,unicode=1")
    ap.add_argument("--bank", type=int, default=0,
                    help="build a stratified bank of this many unique n-grams")
    # Unrecognized arguments are ignored rather than rejected (presumably so
    # the script also runs under notebook launchers — confirm).
    args, _unknown = ap.parse_known_args()

    # "all" and "sources" both mean the source-prefixed config set. Without
    # this normalization the plain atlas mode would treat "sources" as the
    # name of a dataset config and fail to download it.
    use_source_set = args.configs in (["all"], ["sources"])
    acfg = AtlasConfig(checkpoint=args.checkpoint,
                       configs=["all"] if use_source_set else args.configs,
                       d_base=args.d_base, out_dir=args.out, device=args.device)

    # Parse "name=value,name=value"; blank entries (e.g. a trailing comma)
    # are skipped and surrounding whitespace is ignored.
    sw = {}
    for item in args.weights.split(","):
        key, sep, val = item.strip().partition("=")
        if not key and not sep:
            continue
        if not sep:
            ap.error(f"--weights entry {item!r} is not of the form name=value")
        sw[key.strip()] = float(val)

    # None lets the callee fall back to SOURCE_CONFIGS.
    selected = None if use_source_set else args.configs
    if args.bank:
        stratified_bank(acfg, target=args.bank, source_weights=sw,
                        configs=selected)
    elif args.canon:
        canonize(acfg, source_weights=sw, configs=selected)
    else:
        build_atlas(acfg)