""" modular_mind.py -- numpy inference for the trained Modular Mind boss brain. Loads the weights produced by train.py (mm_weights.npz) and runs the exact same forward pass as mm_torch.ModularMindPolicy, in pure numpy (no torch needed at game runtime -> tiny, fast, instant cold-start on a HuggingFace Space). decide(state) returns the chosen boss action plus rich telemetry (per-specialist drives, the shared latent, the coordinator's modulation) so the game can VISUALISE the Modular Mind making each decision -- including the two modulator specialists (Punisher, Enrage) whose only influence is the latent they write into the bridge. """ from __future__ import annotations import os import numpy as np from features import ACTIONS, NF, extract_features, legal_mask # must match mm_torch.SPEC_DEFS SPEC_DEFS = [ ("Aggressor", "CLEAVE", "#ff4d4d"), ("Stalker", "APPROACH", "#4da6ff"), ("Survivor", "RETREAT", "#9b59b6"), ("Baiter", "IDLE", "#f1c40f"), ("Defender", "BLOCK", "#48b0c4"), ("Punisher", None, "#e67e22"), ("Enrage", None, "#c0392b"), ] WEIGHTS_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "mm_weights.npz") H, D_LATENT = 24, 12 def _layernorm(x, w, b, eps=1e-5): mu = x.mean() var = ((x - mu) ** 2).mean() return (x - mu) / np.sqrt(var + eps) * w + b def _relu(x): return np.maximum(x, 0.0) class ModularMind: def __init__(self, weights_path=WEIGHTS_PATH): if os.path.exists(weights_path): self.W = {k: v for k, v in np.load(weights_path).items()} self.trained = True else: # fallback so the game still runs before training finishes self.W = self._random_weights() self.trained = False def _random_weights(self): rng = np.random.default_rng(0) W = {} for i, (_, owns, _) in enumerate(SPEC_DEFS): W[f"s{i}_fc1_w"] = rng.normal(0, .3, (H, NF)) W[f"s{i}_fc1_b"] = np.zeros(H) W[f"s{i}_lat_w"] = rng.normal(0, .3, (D_LATENT, H)) W[f"s{i}_lat_b"] = np.zeros(D_LATENT) if owns is not None: W[f"s{i}_drv_w"] = rng.normal(0, .3, (1, H)) W[f"s{i}_drv_b"] = np.zeros(1) 
W["link_ni_w"] = np.ones(D_LATENT); W["link_ni_b"] = np.zeros(D_LATENT) W["link_v"] = rng.normal(0, .3, (2 * D_LATENT, D_LATENT)) W["link_g"] = rng.normal(0, .3, (2 * D_LATENT, D_LATENT)) W["link_d"] = rng.normal(0, .3, (D_LATENT, 2 * D_LATENT)) W["link_no_w"] = np.ones(D_LATENT); W["link_no_b"] = np.zeros(D_LATENT) W["coord_w"] = rng.normal(0, .3, (len(ACTIONS), D_LATENT)) W["coord_b"] = np.zeros(len(ACTIONS)) return W def decide(self, state: dict, explore: float = 0.06): W = self.W f = extract_features(state).astype(np.float64) drives = np.zeros(len(ACTIONS)) latents = [] per_spec = [] for i, (name, owns, color) in enumerate(SPEC_DEFS): h = np.tanh(W[f"s{i}_fc1_w"] @ f + W[f"s{i}_fc1_b"]) lat = W[f"s{i}_lat_w"] @ h + W[f"s{i}_lat_b"] latents.append(lat) drv = None if owns is not None: drv = float((W[f"s{i}_drv_w"] @ h + W[f"s{i}_drv_b"]).item()) drives[ACTIONS.index(owns)] += drv per_spec.append({ "name": name, "owns": owns, "color": color, "drive": round(drv, 3) if drv is not None else None, "latent_norm": round(float(np.linalg.norm(lat)), 3), }) # RecursiveLink: shared latent bridge z = np.sum(latents, axis=0) zn = _layernorm(z, W["link_ni_w"], W["link_ni_b"]) reglu = _relu(W["link_g"] @ zn) * (W["link_v"] @ zn) out = W["link_d"] @ reglu shared = _layernorm(out + z, W["link_no_w"], W["link_no_b"]) # Coordinator read-out (modulator influence flows in here) modulation = W["coord_w"] @ shared + W["coord_b"] logits = drives + modulation mask = legal_mask(state) masked = np.where(mask > 0.5, logits, -1e9) probs = np.exp(masked - masked.max()) probs = probs / probs.sum() # `explore` is a true mistake-rate: with prob `explore` take a uniformly # random LEGAL action (the difficulty dial), else the best action. (Sampling # from `probs` barely degrades play -- the policy is too confident -- so we # use uniform mistakes to make easier tiers actually easier.) 
if explore > 0 and np.random.random() < explore: legal_idx = np.where(mask > 0.5)[0] choice = int(np.random.choice(legal_idx)) else: choice = int(np.argmax(masked)) phase = 2 if state.get("bossHP", 1.0) < 0.5 else 1 return { "action": ACTIONS[choice], "phase": phase, "trained": self.trained, "specialists": per_spec, "base_drive": {a: round(float(d), 3) for a, d in zip(ACTIONS, drives)}, "modulation": {a: round(float(m), 3) for a, m in zip(ACTIONS, modulation)}, "final_drive": {a: round(float(d), 3) for a, d in zip(ACTIONS, logits)}, "probs": {a: round(float(p), 3) for a, p in zip(ACTIONS, probs)}, "legal": {a: bool(m > 0.5) for a, m in zip(ACTIONS, mask)}, "shared_latent": [round(float(x), 3) for x in shared], } # ---- difficulty tiers -------------------------------------------------------- # Each tier = a different training checkpoint PLUS a decision-noise level. Easy is a # partially-trained brain that also "thinks loosely" (more exploration -> more # mistakes, more openings for the player); Hard is the fully-trained brain playing # sharply. Missing weight files fall back to the hard weights. _DIR = os.path.dirname(os.path.abspath(__file__)) # Difficulty = decision-noise on the trained brain. `explore` is the mistake-rate # (prob of a random legal action); higher = the boss makes more exploitable mistakes. # (Once the boss learned to BLOCK it dominates the sim even early in training, so a # "weaker checkpoint" no longer yields an easy boss -- noise is the controllable dial: # vs a near-optimal dodger these give boss win-rates ~0.35 / ~0.65 / ~0.95.) 
DIFFICULTY = {
    "easy": {"file": "mm_weights.npz", "explore": 0.50},
    "normal": {"file": "mm_weights.npz", "explore": 0.22},
    "hard": {"file": "mm_weights.npz", "explore": 0.04},
}

# Cache of constructed brains, keyed by normalised tier name, so each tier's
# weights are loaded at most once per process.
_MINDS: dict[str, "ModularMind"] = {}


def _tier_key(difficulty) -> str:
    """Map any requested difficulty onto a key of DIFFICULTY.

    The value is stringified, stripped and lower-cased so that inputs such as
    "Easy" or " normal " select the intended tier instead of silently falling
    through to the hardest one. Anything unrecognised maps to "hard".
    """
    key = str(difficulty).strip().lower()
    return key if key in DIFFICULTY else "hard"


def get_mind(difficulty: str = "hard") -> "ModularMind":
    """Return the (cached) ModularMind for `difficulty`.

    Unknown tiers resolve to "hard". If the tier's weights file is missing, the
    hard tier's file is used instead. The hard tier additionally tries to adopt
    the online learner's live weights when that module is importable and enabled.
    """
    key = _tier_key(difficulty)
    if key not in _MINDS:
        path = os.path.join(_DIR, DIFFICULTY[key]["file"])
        if not os.path.exists(path):  # tier not trained yet
            path = os.path.join(_DIR, DIFFICULTY["hard"]["file"])
        mind = ModularMind(path)
        # the HARD tier shares the online learner's live (player-adapted) weights,
        # so finetuning from real fights takes effect immediately. (Lazy import to
        # avoid an import cycle: online -> mm_grad -> modular_mind.)
        if key == "hard":
            # Deliberately best-effort: any failure here is reported and the
            # file-loaded weights are kept, so the game still runs.
            try:
                import online
                if online.ENABLED:
                    mind.W = online.live_weights()
                    mind.trained = True
            except Exception as e:
                print(f"[modular_mind] online weights not shared ({e})")
        _MINDS[key] = mind
    return _MINDS[key]


def decide(state: dict) -> dict:
    """Route a decision to the brain for the requested difficulty tier.

    Reads the optional "difficulty" entry of `state` (default "hard"), calls that
    tier's brain with the tier's `explore` mistake-rate from DIFFICULTY, and adds
    the resolved tier name to the result under "difficulty". As DIFFICULTY is
    configured here, all tiers name the same weights file and differ only in
    their mistake-rate.
    """
    key = _tier_key(state.get("difficulty", "hard"))
    out = get_mind(key).decide(state, explore=DIFFICULTY[key]["explore"])
    out["difficulty"] = key
    return out