ModuleMind / modular_mind.py
Quazim0t0's picture
Add files using upload-large-folder tool
45e7dfb verified
raw
history blame contribute delete
7.95 kB
"""
modular_mind.py -- numpy inference for the trained Modular Mind boss brain.
Loads the weights produced by train.py (mm_weights.npz) and runs the exact same
forward pass as mm_torch.ModularMindPolicy, in pure numpy (no torch needed at game
runtime -> tiny, fast, instant cold-start on a HuggingFace Space).
decide(state) returns the chosen boss action plus rich telemetry (per-specialist
drives, the shared latent, the coordinator's modulation) so the game can VISUALISE
the Modular Mind making each decision -- including the two modulator specialists
(Punisher, Enrage) whose only influence is the latent they write into the bridge.
"""
from __future__ import annotations
import os
import numpy as np
from features import ACTIONS, NF, extract_features, legal_mask
# must match mm_torch.SPEC_DEFS
# Each entry is (name, owned action, colour). `owns` is the ACTIONS entry whose
# drive the specialist adds to directly, or None for a modulator specialist that
# only contributes a latent to the shared bridge. The hex colour string is passed
# through unchanged in the telemetry returned by ModularMind.decide().
SPEC_DEFS = [
("Aggressor", "CLEAVE", "#ff4d4d"),
("Stalker", "APPROACH", "#4da6ff"),
("Survivor", "RETREAT", "#9b59b6"),
("Baiter", "IDLE", "#f1c40f"),
("Defender", "BLOCK", "#48b0c4"),
("Punisher", None, "#e67e22"),
("Enrage", None, "#c0392b"),
]
# Default weights file: mm_weights.npz in the same directory as this module.
WEIGHTS_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "mm_weights.npz")
# H: width of each specialist's hidden layer (fc1 is H x NF).
# D_LATENT: size of the latent vector each specialist writes (lat is D_LATENT x H).
H, D_LATENT = 24, 12
def _layernorm(x, w, b, eps=1e-5):
    """Layer-normalise *x* over its last axis, then apply the affine ``w``/``b``.

    The statistics are the mean and the biased variance (mean of squared
    deviations) of the last axis, so a 1-D vector is normalised as a whole and
    a batched ``(..., D)`` array is normalised row by row instead of across
    the batch.

    Args:
        x: numpy array with at least one dimension; features on the last axis.
        w: per-feature scale, broadcastable against ``x``.
        b: per-feature shift, broadcastable against ``x``.
        eps: small constant added to the variance for numerical stability.

    Returns:
        Array with the broadcast shape of ``x``, ``w`` and ``b``.
    """
    # keepdims so the statistics broadcast back against x for any leading shape.
    mu = x.mean(axis=-1, keepdims=True)
    var = ((x - mu) ** 2).mean(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b
def _relu(x):
    """Rectified linear unit: element-wise maximum of *x* and zero."""
    floor = 0.0
    return np.maximum(x, floor)
class ModularMind:
    """Numpy inference for the Modular Mind boss policy.

    Attributes:
        W: flat dict mapping weight names (``s{i}_fc1_w``, ``link_g``,
            ``coord_w``, ...) to numpy arrays.
        trained: True when ``W`` was loaded from a weights file, False when
            the random fallback weights are in use.
    """

    def __init__(self, weights_path=WEIGHTS_PATH):
        """Load weights from *weights_path*, or fall back to random weights.

        Args:
            weights_path: path to an ``.npz`` archive of weight arrays. When
                the file does not exist, deterministic random weights are
                generated so the game can still run.
        """
        if os.path.exists(weights_path):
            # np.load on an .npz returns a lazy archive that keeps the file
            # open; read every array eagerly and close it instead of leaking
            # the handle.
            with np.load(weights_path) as archive:
                self.W = {key: archive[key] for key in archive.files}
            self.trained = True
        else:  # fallback so the game still runs before training finishes
            self.W = self._random_weights()
            self.trained = False

    def _random_weights(self):
        """Return an untrained weight dict with the keys ``decide`` reads.

        Uses a generator seeded with 0, so the fallback is the same on every
        run. Biases are zero and layer-norm scales are one.
        """
        rng = np.random.default_rng(0)
        W = {}
        for i, (_, owns, _) in enumerate(SPEC_DEFS):
            W[f"s{i}_fc1_w"] = rng.normal(0, .3, (H, NF))
            W[f"s{i}_fc1_b"] = np.zeros(H)
            W[f"s{i}_lat_w"] = rng.normal(0, .3, (D_LATENT, H))
            W[f"s{i}_lat_b"] = np.zeros(D_LATENT)
            # Only specialists that own an action get a drive head.
            if owns is not None:
                W[f"s{i}_drv_w"] = rng.normal(0, .3, (1, H))
                W[f"s{i}_drv_b"] = np.zeros(1)
        W["link_ni_w"] = np.ones(D_LATENT)
        W["link_ni_b"] = np.zeros(D_LATENT)
        W["link_v"] = rng.normal(0, .3, (2 * D_LATENT, D_LATENT))
        W["link_g"] = rng.normal(0, .3, (2 * D_LATENT, D_LATENT))
        W["link_d"] = rng.normal(0, .3, (D_LATENT, 2 * D_LATENT))
        W["link_no_w"] = np.ones(D_LATENT)
        W["link_no_b"] = np.zeros(D_LATENT)
        W["coord_w"] = rng.normal(0, .3, (len(ACTIONS), D_LATENT))
        W["coord_b"] = np.zeros(len(ACTIONS))
        return W

    def decide(self, state: dict, explore: float = 0.06):
        """Run the forward pass on *state* and pick a boss action.

        Args:
            state: game-state dict handed to ``extract_features`` and
                ``legal_mask``; the optional ``"bossHP"`` key (default 1.0)
                selects the reported phase.
            explore: mistake rate -- the probability of taking a uniformly
                random legal action instead of the best one.

        Returns:
            Dict with the chosen ``"action"``, the ``"phase"`` (2 when bossHP
            is below 0.5, else 1), the ``"trained"`` flag and telemetry:
            ``"specialists"``, ``"base_drive"``, ``"modulation"``,
            ``"final_drive"``, ``"probs"``, ``"legal"`` and
            ``"shared_latent"``. Numeric telemetry is rounded to 3 decimals.
        """
        W = self.W
        f = extract_features(state).astype(np.float64)
        drives = np.zeros(len(ACTIONS))
        latents = []
        per_spec = []
        for i, (name, owns, color) in enumerate(SPEC_DEFS):
            h = np.tanh(W[f"s{i}_fc1_w"] @ f + W[f"s{i}_fc1_b"])
            lat = W[f"s{i}_lat_w"] @ h + W[f"s{i}_lat_b"]
            latents.append(lat)
            drv = None
            if owns is not None:
                # Action-owning specialists add a scalar drive to their action.
                drv = float((W[f"s{i}_drv_w"] @ h + W[f"s{i}_drv_b"]).item())
                drives[ACTIONS.index(owns)] += drv
            per_spec.append({
                "name": name, "owns": owns, "color": color,
                "drive": round(drv, 3) if drv is not None else None,
                "latent_norm": round(float(np.linalg.norm(lat)), 3),
            })

        # RecursiveLink: shared latent bridge
        z = np.sum(latents, axis=0)
        zn = _layernorm(z, W["link_ni_w"], W["link_ni_b"])
        reglu = _relu(W["link_g"] @ zn) * (W["link_v"] @ zn)
        out = W["link_d"] @ reglu
        shared = _layernorm(out + z, W["link_no_w"], W["link_no_b"])

        # Coordinator read-out (modulator influence flows in here)
        modulation = W["coord_w"] @ shared + W["coord_b"]
        logits = drives + modulation
        mask = legal_mask(state)
        masked = np.where(mask > 0.5, logits, -1e9)
        probs = np.exp(masked - masked.max())
        probs = probs / probs.sum()

        # `explore` is a true mistake-rate: with prob `explore` take a uniformly
        # random LEGAL action (the difficulty dial), else the best action. (Sampling
        # from `probs` barely degrades play -- the policy is too confident -- so we
        # use uniform mistakes to make easier tiers actually easier.)
        legal_idx = np.where(mask > 0.5)[0]
        # np.random.choice raises ValueError on an empty array, so only explore
        # when at least one action is legal. With nothing legal, both paths use
        # the argmax over the all-masked logits (the first action).
        if explore > 0 and legal_idx.size > 0 and np.random.random() < explore:
            choice = int(np.random.choice(legal_idx))
        else:
            choice = int(np.argmax(masked))

        phase = 2 if state.get("bossHP", 1.0) < 0.5 else 1
        return {
            "action": ACTIONS[choice],
            "phase": phase,
            "trained": self.trained,
            "specialists": per_spec,
            "base_drive": {a: round(float(d), 3) for a, d in zip(ACTIONS, drives)},
            "modulation": {a: round(float(m), 3) for a, m in zip(ACTIONS, modulation)},
            "final_drive": {a: round(float(d), 3) for a, d in zip(ACTIONS, logits)},
            "probs": {a: round(float(p), 3) for a, p in zip(ACTIONS, probs)},
            "legal": {a: bool(m > 0.5) for a, m in zip(ACTIONS, mask)},
            "shared_latent": [round(float(x), 3) for x in shared],
        }
# ---- difficulty tiers --------------------------------------------------------
# Each tier names a weights file and a decision-noise level (`explore`). The
# original design paired every tier with a different training checkpoint (Easy = a
# partially-trained brain that also "thinks loosely"; Hard = the fully-trained brain
# playing sharply). All three tiers below now point at the same weights file, so
# only the noise level differs. Missing weight files fall back to the hard weights.
_DIR = os.path.dirname(os.path.abspath(__file__))
# Difficulty = decision-noise on the trained brain. `explore` is the mistake-rate
# (prob of a random legal action); higher = the boss makes more exploitable mistakes.
# (Once the boss learned to BLOCK it dominates the sim even early in training, so a
# "weaker checkpoint" no longer yields an easy boss -- noise is the controllable dial:
# vs a near-optimal dodger these give boss win-rates ~0.35 / ~0.65 / ~0.95.)
DIFFICULTY = {
"easy": {"file": "mm_weights.npz", "explore": 0.50},
"normal": {"file": "mm_weights.npz", "explore": 0.22},
"hard": {"file": "mm_weights.npz", "explore": 0.04},
}
# Cache of constructed minds keyed by tier name; filled lazily by get_mind().
_MINDS: dict[str, "ModularMind"] = {}
def get_mind(difficulty: str = "hard") -> ModularMind:
    """Return the cached ModularMind for *difficulty*, building it on first use.

    Unknown difficulty names are treated as "hard". If the tier's weights file
    is missing, the hard tier's file is used instead. For the hard tier the
    freshly built mind is additionally pointed at the online learner's live
    weights when the `online` module imports cleanly and is enabled.
    """
    tier = "hard" if difficulty not in DIFFICULTY else difficulty

    cached = _MINDS.get(tier)
    if cached is not None:
        return cached

    weights_file = os.path.join(_DIR, DIFFICULTY[tier]["file"])
    if not os.path.exists(weights_file):  # tier not trained yet
        weights_file = os.path.join(_DIR, DIFFICULTY["hard"]["file"])
    brain = ModularMind(weights_file)

    # the HARD tier shares the online learner's live (player-adapted) weights,
    # so finetuning from real fights takes effect immediately. (Lazy import to
    # avoid an import cycle: online -> mm_grad -> modular_mind.)
    if tier == "hard":
        try:
            import online
            if online.ENABLED:
                brain.W = online.live_weights()
                brain.trained = True
        except Exception as err:
            # Best effort: keep the file-loaded weights if sharing fails.
            print(f"[modular_mind] online weights not shared ({err})")

    _MINDS[tier] = brain
    return brain
def decide(state: dict) -> dict:
    """Route a decision to the brain for the requested difficulty tier.

    Reads ``state["difficulty"]`` (default "hard"; unknown values also map to
    "hard"), asks that tier's mind to decide using the tier's `explore`
    mistake-rate from DIFFICULTY, and returns the mind's result dict with an
    extra ``"difficulty"`` key naming the tier actually used.
    """
    requested = str(state.get("difficulty", "hard"))
    tier = requested if requested in DIFFICULTY else "hard"
    mistake_rate = DIFFICULTY[tier]["explore"]
    result = get_mind(tier).decide(state, explore=mistake_rate)
    result["difficulty"] = tier
    return result