Instructions to use Johnblick187/SmartCoderMoE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use Johnblick187/SmartCoderMoE with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Johnblick187/SmartCoderMoE", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Johnblick187/SmartCoderMoE", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use Johnblick187/SmartCoderMoE with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Johnblick187/SmartCoderMoE"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/Johnblick187/SmartCoderMoE

SGLang

How to use Johnblick187/SmartCoderMoE with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Johnblick187/SmartCoderMoE" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Johnblick187/SmartCoderMoE" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use Johnblick187/SmartCoderMoE with Docker Model Runner:
```
docker model run hf.co/Johnblick187/SmartCoderMoE
```

Johnblick187 committed 18 days ago

Commit

d9c1d79

verified ·

1 Parent(s): d00ea53

Update modeling_smartcoder_moe.py

Browse files

Files changed (1) hide show

modeling_smartcoder_moe.py +73 -161

modeling_smartcoder_moe.py CHANGED Viewed

@@ -19,9 +19,8 @@ import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import PreTrainedModel, PretrainedConfig
 from transformers.modeling_outputs import CausalLMOutputWithPast
-from typing import Optional, Tuple, List
 # ── Config ────────────────────────────────────────────────────────────────────
@@ -84,24 +83,24 @@ class RotaryEmbedding(nn.Module):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
         self.register_buffer("inv_freq", inv_freq)
-        self.max_pos = max_pos
-        self._build_cache(max_pos)
-    def _build_cache(self, seq_len):
-        t = torch.arange(seq_len, device=self.inv_freq.device).float()
-        freqs = torch.outer(t, self.inv_freq)
         emb = torch.cat([freqs, freqs], dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :])
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :])
-    def forward(self, seq_len):
-        if seq_len > self.max_pos:
-            self._build_cache(seq_len)
         return self.cos_cached[:, :, :seq_len, :], \
                self.sin_cached[:, :, :seq_len, :]
-# ── LayerNorm (with bias) ─────────────────────────────────────────────────────
 class LayerNormWithBias(nn.Module):
     def __init__(self, hidden_size, eps=1e-5):
         super().__init__()
@@ -117,120 +116,80 @@ class LayerNormWithBias(nn.Module):
 class SmartCoderAttention(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
-        self.hidden_size = config.hidden_size
-        self.num_heads   = config.num_attention_heads
         self.num_kv_heads = config.num_key_value_heads
-        self.head_dim    = config.head_dim
         self.num_kv_groups = self.num_heads // self.num_kv_heads
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=True)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
         self.o_proj = nn.Linear(config.num_attention_heads * config.head_dim, config.hidden_size, bias=True)
         self.rotary_emb = RotaryEmbedding(config.head_dim, config.max_position_embeddings, config.rope_theta)
-    def forward(self, hidden_states, attention_mask=None, past_key_value=None, use_cache=False):
         B, T, _ = hidden_states.shape
         q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
-        cos, sin = self.rotary_emb(T)
         cos = cos[:, :, :T, :self.head_dim]
         sin = sin[:, :, :T, :self.head_dim]
         q, k = apply_rotary_emb(q, k, cos, sin)
-        if past_key_value is not None:
-            k = torch.cat([past_key_value[0], k], dim=2)
-            v = torch.cat([past_key_value[1], v], dim=2)
-        present = (k, v) if use_cache else None
-        # Expand KV heads to match Q heads (GQA)
         k = k.repeat_interleave(self.num_kv_groups, dim=1)
         v = v.repeat_interleave(self.num_kv_groups, dim=1)
-        scale = math.sqrt(self.head_dim)
-        attn = torch.matmul(q, k.transpose(-2, -1)) / scale
-        kv_len = k.shape[2]
-        causal_mask = torch.triu(
-            torch.full((T, kv_len), float("-inf"), device=q.device, dtype=q.dtype),
-            diagonal=1 + kv_len - T
-        )
-        attn = attn + causal_mask.unsqueeze(0).unsqueeze(0)
         if attention_mask is not None:
             attn = attn + attention_mask
         attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
-        out = torch.matmul(attn, v)
-        out = out.transpose(1, 2).contiguous().view(B, T, -1)
-        return self.o_proj(out), present
 # ── MoE MLP ───────────────────────────────────────────────────────────────────
 class SmartCoderMoEMLP(nn.Module):
-    """
-    Hybrid Dense + MoE MLP.
-    dense path:   hidden -> dense_fc (8192) -> gelu -> dense_proj (2048)
-    expert path:  router picks top-k experts from experts_fc/experts_proj
-    output = dense_out + expert_out
-    """
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
         H  = config.hidden_size
         DI = config.dense_intermediate_size
         NE = config.num_experts
         EI = config.expert_intermediate_size
-        K  = config.num_experts_per_tok
-        self.num_experts     = NE
-        self.top_k           = K
-        # Dense residual path
-        self.dense_fc   = nn.Linear(H, DI, bias=True)
-        self.dense_proj = nn.Linear(DI, H, bias=True)
-        # MoE path — stored as batched weight matrices matching safetensors layout
-        # experts_fc:   [NE, EI, H]
-        # experts_proj: [NE, H, EI]
-        self.experts_fc   = nn.Parameter(torch.empty(NE, EI, H))
         self.experts_proj = nn.Parameter(torch.empty(NE, H, EI))
-        self.router       = nn.Linear(H, NE, bias=False)
     def forward(self, x):
         B, T, H = x.shape
-        # Dense path
         dense_out = self.dense_proj(F.gelu(self.dense_fc(x)))
-        # Router
-        router_logits = self.router(x)                          # [B, T, NE]
         router_weights = F.softmax(router_logits, dim=-1)
-        top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)  # [B, T, K]
-        top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)   # normalize
-        # Expert computation — iterate over top-k (K is small so this is fine)
         expert_out = torch.zeros_like(x)
         x_flat = x.view(B * T, H)
         for k in range(self.top_k):
-            expert_ids = top_indices[:, :, k].reshape(B * T)   # [B*T]
-            weights    = top_weights[:, :, k].reshape(B * T, 1) # [B*T, 1]
-            # Batched expert forward using einsum
-            # For each token, pick its expert's weights
-            # experts_fc: [NE, EI, H] → gather → [B*T, EI, H]
-            fc_w   = self.experts_fc[expert_ids]    # [B*T, EI, H]
-            proj_w = self.experts_proj[expert_ids]  # [B*T, H, EI]
-            # up: [B*T, EI]
             hidden = F.gelu(torch.bmm(fc_w, x_flat.unsqueeze(-1)).squeeze(-1))
-            # down: [B*T, H]
-            out = torch.bmm(proj_w, hidden.unsqueeze(-1)).squeeze(-1)
             expert_out = expert_out + (out * weights).view(B, T, H)
         return dense_out + expert_out
@@ -240,59 +199,42 @@ class SmartCoderMoEMLP(nn.Module):
 class SmartCoderDecoderLayer(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
-        self.input_layernorm         = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
-        self.self_attn               = SmartCoderAttention(config)
         self.post_attention_layernorm = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
-        self.mlp                     = SmartCoderMoEMLP(config)
-    def forward(self, hidden_states, attention_mask=None, past_key_value=None, use_cache=False):
-        # Attention
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
-        hidden_states, present = self.self_attn(
-            hidden_states, attention_mask=attention_mask,
-            past_key_value=past_key_value, use_cache=use_cache
-        )
         hidden_states = residual + hidden_states
-        # MLP
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
-        return hidden_states, present
-# ── Full Model ────────────────────────────────────────────────────────────────
 class SmartCoderMoEModel(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
-        self.layers = nn.ModuleList([
-            SmartCoderDecoderLayer(config) for _ in range(config.num_hidden_layers)
-        ])
-        self.norm = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
-    def forward(self, input_ids, attention_mask=None, past_key_values=None, use_cache=False):
         hidden_states = self.embed_tokens(input_ids)
-        presents = [] if use_cache else None
-        for i, layer in enumerate(self.layers):
-            pkv = past_key_values[i] if past_key_values else None
-            hidden_states, present = layer(
-                hidden_states, attention_mask=attention_mask,
-                past_key_value=pkv, use_cache=use_cache
-            )
-            if use_cache:
-                presents.append(present)
-        hidden_states = self.norm(hidden_states)
-        return hidden_states, presents
-# ── CausalLM wrapper ──────────────────────────────────────────────────────────
-class SmartCoderMoEForCausalLM(PreTrainedModel):
     config_class = SmartCoderMoEConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = False
@@ -303,11 +245,8 @@ class SmartCoderMoEForCausalLM(PreTrainedModel):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-    def get_output_embeddings(self):
-        return self.lm_head
     def forward(
         self,
@@ -316,13 +255,10 @@ class SmartCoderMoEForCausalLM(PreTrainedModel):
         past_key_values=None,
         inputs_embeds=None,
         labels=None,
-        use_cache=True,
         **kwargs,
     ):
-        hidden_states, presents = self.model(
-            input_ids, attention_mask=attention_mask,
-            past_key_values=past_key_values, use_cache=use_cache
-        )
         logits = self.lm_head(hidden_states)
         loss = None
@@ -335,24 +271,18 @@ class SmartCoderMoEForCausalLM(PreTrainedModel):
                 ignore_index=-100,
             )
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=presents,
-        )
-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
-        if past_key_values:
-            input_ids = input_ids[:, -1:]
-        return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": True}
 # ── Loader ────────────────────────────────────────────────────────────────────
 def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloat16):
-    """Load SmartCoderMoE with correct custom architecture."""
     import os
     from huggingface_hub import snapshot_download
     from safetensors.torch import load_file
     os.environ["HF_HUB_DISABLE_XET"] = "1"
@@ -364,50 +294,32 @@ def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloa
     model = SmartCoderMoEForCausalLM(config)
     print("Loading weights...")
-    from pathlib import Path
     sf_files = sorted(Path(model_dir).glob("*.safetensors"))
     state_dict = {}
     for f in sf_files:
         state_dict.update(load_file(str(f)))
-    # experts_fc in safetensors: [32, 512, 2048] — matches our [NE, EI, H] ✓
-    # experts_proj in safetensors: [32, 2048, 512] — matches our [NE, H, EI] ✓
-    # router in safetensors: [32, 2048] — stored as Linear weight [out, in] ✓
     missing, unexpected = model.load_state_dict(state_dict, strict=False)
     if missing:
-        print(f"Missing keys: {missing[:5]}{'...' if len(missing)>5 else ''}")
     if unexpected:
-        print(f"Unexpected keys: {unexpected[:5]}{'...' if len(unexpected)>5 else ''}")
     model = model.to(dtype)
     print(f"Loaded! Params: {sum(p.numel() for p in model.parameters())/1e9:.2f}B")
     return model, config
-if __name__ == "__main__":
-    from transformers import AutoTokenizer
-    import torch
-    model, config = load_smartcoder_moe()
-    model.eval()
-    model = model.cuda()
-    tokenizer = AutoTokenizer.from_pretrained("Johnblick187/SmartCoderMoE", trust_remote_code=True)
-    prompt = "def fibonacci(n):"
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    input_len = inputs["input_ids"].shape[-1]
-    with torch.no_grad():
-        out = model.generate(
-            **inputs,
-            max_new_tokens=150,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.95,
-            repetition_penalty=1.3,
-            pad_token_id=tokenizer.eos_token_id,
-        )
-    print(tokenizer.decode(out[0][input_len:], skip_special_tokens=True))

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithPast
 # ── Config ────────────────────────────────────────────────────────────────────
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
         self.register_buffer("inv_freq", inv_freq)
+        self._cached_len = 0
+    def _build_cache(self, seq_len, device):
+        t = torch.arange(seq_len, device=device).float()
+        freqs = torch.outer(t, self.inv_freq.to(device))
         emb = torch.cat([freqs, freqs], dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+        self._cached_len = seq_len
+    def forward(self, seq_len, device):
+        if seq_len > self._cached_len:
+            self._build_cache(seq_len, device)
         return self.cos_cached[:, :, :seq_len, :], \
                self.sin_cached[:, :, :seq_len, :]
+# ── LayerNorm with bias ───────────────────────────────────────────────────────
 class LayerNormWithBias(nn.Module):
     def __init__(self, hidden_size, eps=1e-5):
         super().__init__()
 class SmartCoderAttention(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
+        self.num_heads    = config.num_attention_heads
         self.num_kv_heads = config.num_key_value_heads
+        self.head_dim     = config.head_dim
         self.num_kv_groups = self.num_heads // self.num_kv_heads
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=True)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
         self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
         self.o_proj = nn.Linear(config.num_attention_heads * config.head_dim, config.hidden_size, bias=True)
         self.rotary_emb = RotaryEmbedding(config.head_dim, config.max_position_embeddings, config.rope_theta)
+    def forward(self, hidden_states, attention_mask=None, **kwargs):
         B, T, _ = hidden_states.shape
         q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
         v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rotary_emb(T, hidden_states.device)
         cos = cos[:, :, :T, :self.head_dim]
         sin = sin[:, :, :T, :self.head_dim]
         q, k = apply_rotary_emb(q, k, cos, sin)
         k = k.repeat_interleave(self.num_kv_groups, dim=1)
         v = v.repeat_interleave(self.num_kv_groups, dim=1)
+        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
+        causal = torch.triu(torch.full((T, T), float("-inf"), device=q.device, dtype=q.dtype), diagonal=1)
+        attn = attn + causal.unsqueeze(0).unsqueeze(0)
         if attention_mask is not None:
             attn = attn + attention_mask
         attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
+        out = torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)
+        return self.o_proj(out)
 # ── MoE MLP ───────────────────────────────────────────────────────────────────
 class SmartCoderMoEMLP(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
         H  = config.hidden_size
         DI = config.dense_intermediate_size
         NE = config.num_experts
         EI = config.expert_intermediate_size
+        self.num_experts = NE
+        self.top_k       = config.num_experts_per_tok
+        self.dense_fc    = nn.Linear(H, DI, bias=True)
+        self.dense_proj  = nn.Linear(DI, H, bias=True)
+        self.experts_fc  = nn.Parameter(torch.empty(NE, EI, H))
         self.experts_proj = nn.Parameter(torch.empty(NE, H, EI))
+        self.router      = nn.Linear(H, NE, bias=False)
     def forward(self, x):
         B, T, H = x.shape
         dense_out = self.dense_proj(F.gelu(self.dense_fc(x)))
+        router_logits  = self.router(x)
         router_weights = F.softmax(router_logits, dim=-1)
+        top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)
+        top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)
         expert_out = torch.zeros_like(x)
         x_flat = x.view(B * T, H)
         for k in range(self.top_k):
+            expert_ids = top_indices[:, :, k].reshape(B * T)
+            weights    = top_weights[:, :, k].reshape(B * T, 1)
+            fc_w   = self.experts_fc[expert_ids]
+            proj_w = self.experts_proj[expert_ids]
             hidden = F.gelu(torch.bmm(fc_w, x_flat.unsqueeze(-1)).squeeze(-1))
+            out    = torch.bmm(proj_w, hidden.unsqueeze(-1)).squeeze(-1)
             expert_out = expert_out + (out * weights).view(B, T, H)
         return dense_out + expert_out
 class SmartCoderDecoderLayer(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
+        self.input_layernorm          = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
+        self.self_attn                = SmartCoderAttention(config)
         self.post_attention_layernorm = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
+        self.mlp                      = SmartCoderMoEMLP(config)
+    def forward(self, hidden_states, attention_mask=None, **kwargs):
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(hidden_states, attention_mask=attention_mask)
         hidden_states = residual + hidden_states
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
+        return hidden_states
+# ── Model ─────────────────────────────────────────────────────────────────────
 class SmartCoderMoEModel(nn.Module):
     def __init__(self, config: SmartCoderMoEConfig):
         super().__init__()
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList([SmartCoderDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm   = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
+    def forward(self, input_ids, attention_mask=None, **kwargs):
         hidden_states = self.embed_tokens(input_ids)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask=attention_mask)
+        return self.norm(hidden_states)
+# ── CausalLM ──────────────────────────────────────────────────────────────────
+class SmartCoderMoEForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = SmartCoderMoEConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = False
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
+    def get_input_embeddings(self): return self.model.embed_tokens
+    def get_output_embeddings(self): return self.lm_head
     def forward(
         self,
         past_key_values=None,
         inputs_embeds=None,
         labels=None,
+        use_cache=None,
         **kwargs,
     ):
+        hidden_states = self.model(input_ids, attention_mask=attention_mask)
         logits = self.lm_head(hidden_states)
         loss = None
                 ignore_index=-100,
             )
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None)
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {"input_ids": input_ids}
 # ── Loader ────────────────────────────────────────────────────────────────────
 def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloat16):
     import os
     from huggingface_hub import snapshot_download
     from safetensors.torch import load_file
+    from pathlib import Path
     os.environ["HF_HUB_DISABLE_XET"] = "1"
     model = SmartCoderMoEForCausalLM(config)
     print("Loading weights...")
     sf_files = sorted(Path(model_dir).glob("*.safetensors"))
     state_dict = {}
     for f in sf_files:
         state_dict.update(load_file(str(f)))
+    # Remap expert keys — safetensors has .weight suffix, our params don't
+    remapped = {}
+    for k, v in state_dict.items():
+        if 'experts_fc.weight' in k:
+            remapped[k.replace('experts_fc.weight', 'experts_fc')] = v
+        elif 'experts_proj.weight' in k:
+            remapped[k.replace('experts_proj.weight', 'experts_proj')] = v
+        else:
+            remapped[k] = v
+    state_dict = remapped
     missing, unexpected = model.load_state_dict(state_dict, strict=False)
     if missing:
+        print(f"Missing: {missing[:3]}{'...' if len(missing)>3 else ''}")
     if unexpected:
+        print(f"Unexpected: {unexpected[:3]}{'...' if len(unexpected)>3 else ''}")
     model = model.to(dtype)
     print(f"Loaded! Params: {sum(p.numel() for p in model.parameters())/1e9:.2f}B")
     return model, config
+from transformers import AutoConfig, AutoModelForCausalLM
+AutoConfig.register("smartcoder_moe", SmartCoderMoEConfig)
+AutoModelForCausalLM.register(SmartCoderMoEConfig, SmartCoderMoEForCausalLM)