Instructions to use Johnblick187/SmartCoderMoE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use Johnblick187/SmartCoderMoE with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Johnblick187/SmartCoderMoE", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Johnblick187/SmartCoderMoE", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use Johnblick187/SmartCoderMoE with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Johnblick187/SmartCoderMoE"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/Johnblick187/SmartCoderMoE

SGLang

How to use Johnblick187/SmartCoderMoE with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Johnblick187/SmartCoderMoE" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Johnblick187/SmartCoderMoE" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Johnblick187/SmartCoderMoE",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use Johnblick187/SmartCoderMoE with Docker Model Runner:
```
docker model run hf.co/Johnblick187/SmartCoderMoE
```

Johnblick187 committed 16 days ago

Commit

32dba37

verified ·

1 Parent(s): 21425ee

Upload modeling_smartcoder.py

Browse files

Files changed (1) hide show

modeling_smartcoder.py +413 -0

modeling_smartcoder.py ADDED Viewed

	@@ -0,0 +1,413 @@

+"""
+modeling_smartcoder.py
+Custom model class for SmartCoderMoE.
+Architecture (from tensor inspection):
+- vocab_size: 65536, hidden: 2048, layers: 40
+- Attention: q[2048,2048], k/v[512,2048] — 16 heads, 4 KV heads, head_dim=128
+- MLP (hybrid dense + MoE):
+    dense_fc:     [8192, 2048]    up
+    dense_proj:   [2048, 8192]    down
+    experts_fc:   [32, 512, 2048] expert up (batched)
+    experts_proj: [32, 2048, 512] expert down (batched)
+    router:       [32, 2048]      router logits
+- LayerNorm: weight+bias (input_layernorm, post_attention_layernorm)
+- Final norm: model.norm.weight/bias
+"""
+import math
+from typing import Optional, Tuple, List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast
+# ── Config ────────────────────────────────────────────────────────────────────
+class SmartCoderMoEConfig(PretrainedConfig):
+    """
+    Hyper-parameter container for the SmartCoderMoE model.
+    Every architecture argument is stored on the instance under its own name.
+    ``head_dim`` is not an argument: it is derived as
+    ``hidden_size // num_attention_heads``. The three token ids and
+    ``tie_word_embeddings`` are forwarded to ``PretrainedConfig.__init__``
+    together with any extra ``**kwargs``.
+    """
+    model_type = "smartcoder_moe"
+    def __init__(
+        self,
+        vocab_size=65536,
+        hidden_size=2048,
+        num_hidden_layers=40,
+        num_attention_heads=16,
+        num_key_value_heads=4,
+        dense_intermediate_size=8192,
+        num_experts=32,
+        expert_intermediate_size=512,
+        num_experts_per_tok=2,
+        max_position_embeddings=16384,
+        rope_theta=10000.0,
+        rms_norm_eps=1e-5,  # despite the name, the layers in this file pass it to a biased LayerNorm
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=0,  # NOTE(review): same id as pad_token_id — confirm this is intended
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        # Architecture attributes are set before delegating to the base class.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        # Integer division: hidden_size is assumed to be a multiple of the head count.
+        self.head_dim = hidden_size // num_attention_heads
+        self.dense_intermediate_size = dense_intermediate_size
+        self.num_experts = num_experts
+        self.expert_intermediate_size = expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.rms_norm_eps = rms_norm_eps
+        # Token ids, weight tying and any unrecognised kwargs are handled by the base class.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+# ── RoPE ──────────────────────────────────────────────────────────────────────
+def rotate_half(x):
+    """
+    Swap the two halves of the last dimension and negate the second half.
+    With ``x = [a, b]`` split at ``x.shape[-1] // 2`` the result is ``[-b, a]``.
+    """
+    half = x.shape[-1] // 2
+    front = x[..., :half]
+    back = x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+def apply_rotary_emb(q, k, cos, sin):
+    """
+    Rotate ``q`` and ``k`` with the supplied cosine/sine tables.
+    Each tensor ``t`` becomes ``t * cos + rotate_half(t) * sin``; the rotated
+    ``(q, k)`` pair is returned as a tuple.
+    """
+    q_rotated = (q * cos) + (rotate_half(q) * sin)
+    k_rotated = (k * cos) + (rotate_half(k) * sin)
+    return q_rotated, k_rotated
+class RotaryEmbedding(nn.Module):
+    """
+    Cached cosine/sine tables for rotary position embeddings.
+    Args:
+        dim: size of the rotated dimension; the tables have ``dim`` columns
+            (``dim // 2`` frequencies, duplicated).
+        max_pos: number of positions precomputed at construction time.
+        base: base of the geometric frequency progression.
+    ``forward(seq_len)`` returns ``(cos, sin)``, each shaped
+    ``[1, 1, seq_len, dim]``. The cache grows on demand when a longer
+    sequence is requested.
+    """
+    def __init__(self, dim, max_pos=16384, base=10000.0):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self.max_pos = max_pos
+        self._build_cache(max_pos)
+    def _build_cache(self, seq_len):
+        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1``."""
+        t = torch.arange(seq_len, device=self.inv_freq.device).float()
+        freqs = torch.outer(t, self.inv_freq)
+        emb = torch.cat([freqs, freqs], dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :])
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :])
+        # Record the new capacity. Without this, forward() keeps comparing
+        # against the construction-time limit and rebuilds the whole table on
+        # every call once a sequence exceeds it.
+        self.max_pos = seq_len
+    def forward(self, seq_len):
+        """Return the ``(cos, sin)`` tables truncated to ``seq_len`` positions."""
+        if seq_len > self.max_pos:
+            self._build_cache(seq_len)
+        return self.cos_cached[:, :, :seq_len, :], \
+               self.sin_cached[:, :, :seq_len, :]
+# ── LayerNorm (with bias) ─────────────────────────────────────────────────────
+class LayerNormWithBias(nn.Module):
+    """
+    Layer normalisation over the last dimension with a learnable scale
+    (``weight``, initialised to ones) and shift (``bias``, initialised to zeros).
+    """
+    def __init__(self, hidden_size, eps=1e-5):
+        super().__init__()
+        scale = torch.ones(hidden_size)
+        shift = torch.zeros(hidden_size)
+        self.weight = nn.Parameter(scale)
+        self.bias = nn.Parameter(shift)
+        self.eps = eps
+    def forward(self, x):
+        # Normalise only the trailing dimension, whatever the leading shape is.
+        normalized_shape = x.shape[-1:]
+        return F.layer_norm(
+            x, normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps
+        )
+# ── Attention ─────────────────────────────────────────────────────────────────
+class SmartCoderAttention(nn.Module):
+    """
+    Causal grouped-query self-attention with rotary position embeddings.
+    Queries use ``num_attention_heads`` heads; keys and values use
+    ``num_key_value_heads`` heads that are repeated to match the query heads.
+    All four projections carry a bias.
+    """
+    def __init__(self, config: SmartCoderMoEConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_heads   = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim    = config.head_dim
+        self.num_kv_groups = self.num_heads // self.num_kv_heads
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=True)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
+        self.o_proj = nn.Linear(config.num_attention_heads * config.head_dim, config.hidden_size, bias=True)
+        self.rotary_emb = RotaryEmbedding(config.head_dim, config.max_position_embeddings, config.rope_theta)
+    def forward(self, hidden_states, attention_mask=None, past_key_value=None, use_cache=False):
+        """
+        Args:
+            hidden_states: ``[B, T, hidden_size]`` activations for the new tokens.
+            attention_mask: optional. Either an additive mask broadcastable to
+                ``[B, heads, T, kv_len]``, or a 2-D ``[B, kv_len]`` padding mask
+                (non-zero = attend, zero = ignore) which is converted to an
+                additive bias here.
+            past_key_value: optional ``(k, v)`` pair of already-rotated cached
+                keys/values shaped ``[B, kv_heads, past_len, head_dim]``.
+            use_cache: when true, the updated ``(k, v)`` pair is returned.
+        Returns:
+            ``(output, present)`` where ``output`` is ``[B, T, hidden_size]`` and
+            ``present`` is the ``(k, v)`` cache or ``None``.
+        """
+        B, T, _ = hidden_states.shape
+        q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        # The new tokens sit *after* the cached ones, so their rotary positions
+        # start at past_len. Using 0..T-1 here would rotate every incrementally
+        # decoded token as if it were at position 0.
+        past_len = past_key_value[0].shape[2] if past_key_value is not None else 0
+        cos, sin = self.rotary_emb(past_len + T)
+        # Cast so a freshly rebuilt float32 table cannot up-cast q/k.
+        cos = cos[:, :, past_len:past_len + T, :self.head_dim].to(q.dtype)
+        sin = sin[:, :, past_len:past_len + T, :self.head_dim].to(q.dtype)
+        q, k = apply_rotary_emb(q, k, cos, sin)
+        if past_key_value is not None:
+            k = torch.cat([past_key_value[0], k], dim=2)
+            v = torch.cat([past_key_value[1], v], dim=2)
+        present = (k, v) if use_cache else None
+        # Expand KV heads to match Q heads (GQA)
+        k = k.repeat_interleave(self.num_kv_groups, dim=1)
+        v = v.repeat_interleave(self.num_kv_groups, dim=1)
+        scale = math.sqrt(self.head_dim)
+        attn = torch.matmul(q, k.transpose(-2, -1)) / scale
+        kv_len = k.shape[2]
+        # Query i (absolute position past_len + i) may attend to keys 0..past_len + i.
+        causal_mask = torch.triu(
+            torch.full((T, kv_len), float("-inf"), device=q.device, dtype=q.dtype),
+            diagonal=1 + kv_len - T
+        )
+        attn = attn + causal_mask.unsqueeze(0).unsqueeze(0)
+        if attention_mask is not None:
+            if attention_mask.dim() == 2:
+                # Tokenizer-style 0/1 padding mask: turn it into an additive
+                # bias instead of adding the raw 0/1 values to the scores.
+                # finfo.min (not -inf) keeps fully padded rows free of NaNs.
+                keep = attention_mask[:, None, None, :].to(torch.bool)
+                attention_mask = torch.zeros(
+                    keep.shape, dtype=attn.dtype, device=attn.device
+                ).masked_fill(~keep, torch.finfo(attn.dtype).min)
+            attn = attn + attention_mask
+        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
+        out = torch.matmul(attn, v)
+        out = out.transpose(1, 2).contiguous().view(B, T, -1)
+        return self.o_proj(out), present
+# ── MoE MLP ───────────────────────────────────────────────────────────────────
+class SmartCoderMoEMLP(nn.Module):
+    """
+    Hybrid Dense + MoE MLP.
+    dense path:   hidden -> dense_fc -> gelu -> dense_proj
+    expert path:  a softmax router picks the top-k experts per token; each
+                  selected expert applies experts_fc -> gelu -> experts_proj and
+                  the results are mixed with the renormalised router weights
+    output = dense_out + expert_out
+    """
+    def __init__(self, config: SmartCoderMoEConfig):
+        super().__init__()
+        H  = config.hidden_size
+        DI = config.dense_intermediate_size
+        NE = config.num_experts
+        EI = config.expert_intermediate_size
+        K  = config.num_experts_per_tok
+        self.num_experts     = NE
+        self.top_k           = K
+        # Dense residual path
+        self.dense_fc   = nn.Linear(H, DI, bias=True)
+        self.dense_proj = nn.Linear(DI, H, bias=True)
+        # MoE path — stored as batched weight matrices matching safetensors layout
+        # experts_fc:   [NE, EI, H]
+        # experts_proj: [NE, H, EI]
+        # NOTE: torch.empty leaves these uninitialised; they must come from a checkpoint.
+        self.experts_fc   = nn.Parameter(torch.empty(NE, EI, H))
+        self.experts_proj = nn.Parameter(torch.empty(NE, H, EI))
+        self.router       = nn.Linear(H, NE, bias=False)
+    def forward(self, x):
+        """
+        Args:
+            x: ``[B, T, H]`` activations (need not be contiguous).
+        Returns:
+            ``[B, T, H]`` tensor: dense-path output plus weighted expert output.
+        """
+        B, T, H = x.shape
+        # Dense path
+        dense_out = self.dense_proj(F.gelu(self.dense_fc(x)))
+        # Router
+        router_logits = self.router(x)                          # [B, T, NE]
+        router_weights = F.softmax(router_logits, dim=-1)
+        top_weights, top_indices = router_weights.topk(self.top_k, dim=-1)  # [B, T, K]
+        top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)   # normalize
+        # Expert computation — dispatch tokens to experts instead of gathering a
+        # private [EI, H] / [H, EI] weight copy per token, which would need
+        # O(B*T*EI*H) transient memory.
+        x_flat = x.reshape(B * T, H)                        # reshape tolerates non-contiguous x
+        flat_ids = top_indices.reshape(B * T, self.top_k)   # [B*T, K]
+        flat_w   = top_weights.reshape(B * T, self.top_k)   # [B*T, K]
+        expert_out = torch.zeros_like(x_flat)
+        for expert_id in torch.unique(flat_ids).tolist():
+            # Rows = tokens routed to this expert, slot = which of their K picks it was.
+            token_pos, slot = (flat_ids == expert_id).nonzero(as_tuple=True)
+            tokens = x_flat[token_pos]                                       # [n, H]
+            hidden = F.gelu(F.linear(tokens, self.experts_fc[expert_id]))    # [n, EI]
+            out = F.linear(hidden, self.experts_proj[expert_id])             # [n, H]
+            weighted = out * flat_w[token_pos, slot].unsqueeze(-1)
+            expert_out.index_add_(0, token_pos, weighted.to(expert_out.dtype))
+        return dense_out + expert_out.view(B, T, H)
+# ── Decoder Layer ─────────────────────────────────────────────────────────────
+class SmartCoderDecoderLayer(nn.Module):
+    """
+    Pre-norm transformer block: LayerNorm -> self-attention -> residual add,
+    then LayerNorm -> hybrid dense/MoE MLP -> residual add.
+    """
+    def __init__(self, config: SmartCoderMoEConfig):
+        super().__init__()
+        width = config.hidden_size
+        eps = config.rms_norm_eps
+        self.input_layernorm = LayerNormWithBias(width, eps)
+        self.self_attn = SmartCoderAttention(config)
+        self.post_attention_layernorm = LayerNormWithBias(width, eps)
+        self.mlp = SmartCoderMoEMLP(config)
+    def forward(self, hidden_states, attention_mask=None, past_key_value=None, use_cache=False):
+        """Run one block; returns ``(hidden_states, present)`` where ``present`` is the attention cache."""
+        # Attention sub-layer with residual connection
+        attn_out, present = self.self_attn(
+            self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+        )
+        hidden_states = hidden_states + attn_out
+        # MLP sub-layer with residual connection
+        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
+        return hidden_states + mlp_out, present
+# ── Full Model ────────────────────────────────────────────────────────────────
+class SmartCoderMoEModel(nn.Module):
+    """
+    Decoder stack: token embedding, ``num_hidden_layers`` decoder layers and a
+    final biased LayerNorm.
+    """
+    def __init__(self, config: SmartCoderMoEConfig):
+        super().__init__()
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        blocks = [SmartCoderDecoderLayer(config) for _ in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(blocks)
+        self.norm = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)
+    def forward(self, input_ids, attention_mask=None, past_key_values=None, use_cache=False):
+        """
+        Embed ``input_ids`` and run them through every layer.
+        Returns ``(hidden_states, presents)``; ``presents`` is a per-layer list of
+        attention caches when ``use_cache`` is true, otherwise ``None``.
+        """
+        hidden_states = self.embed_tokens(input_ids)
+        presents = [] if use_cache else None
+        for idx, block in enumerate(self.layers):
+            # An absent or empty cache means no layer has a past.
+            layer_past = None
+            if past_key_values:
+                layer_past = past_key_values[idx]
+            hidden_states, present = block(
+                hidden_states,
+                attention_mask=attention_mask,
+                past_key_value=layer_past,
+                use_cache=use_cache,
+            )
+            if presents is not None:
+                presents.append(present)
+        return self.norm(hidden_states), presents
+# ── CausalLM wrapper ──────────────────────────────────────────────────────────
+class SmartCoderMoEForCausalLM(PreTrainedModel, GenerationMixin):
+    """
+    Causal-LM head on top of ``SmartCoderMoEModel``.
+    ``GenerationMixin`` is listed explicitly (after ``PreTrainedModel``) because
+    recent transformers releases no longer provide ``generate()`` through
+    ``PreTrainedModel`` alone.
+    """
+    config_class = SmartCoderMoEConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = False
+    def __init__(self, config: SmartCoderMoEConfig):
+        super().__init__(config)
+        self.model   = SmartCoderMoEModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def get_output_embeddings(self):
+        return self.lm_head
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        labels=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        """
+        Args:
+            input_ids: token ids, required.
+            attention_mask: forwarded unchanged to every attention layer.
+            past_key_values: per-layer ``(k, v)`` caches from a previous call.
+            inputs_embeds: accepted for API compatibility but not supported.
+            labels: optional targets; positions equal to ``-100`` are ignored.
+            use_cache: whether to return the updated caches.
+        Returns:
+            ``CausalLMOutputWithPast`` with ``loss`` (or ``None``), ``logits``
+            and ``past_key_values``.
+        Raises:
+            ValueError: if ``input_ids`` is ``None``.
+        """
+        if input_ids is None:
+            # Fail with a clear message instead of an opaque error inside nn.Embedding.
+            raise ValueError(
+                "SmartCoderMoEForCausalLM requires `input_ids`; `inputs_embeds` is not supported."
+            )
+        hidden_states, presents = self.model(
+            input_ids, attention_mask=attention_mask,
+            past_key_values=past_key_values, use_cache=use_cache
+        )
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Next-token objective: position t predicts token t + 1.
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=presents,
+        )
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+        """Build the per-step inputs for ``generate()``: once a cache exists only the last token is fed."""
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": True}
+# ── Loader ────────────────────────────────────────────────────────────────────
+def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloat16):
+    """
+    Load SmartCoderMoE with correct custom architecture.
+    Downloads a snapshot of ``model_id``, builds the model from a default
+    ``SmartCoderMoEConfig``, loads every ``*.safetensors`` shard found at the
+    top level of the snapshot (non-strict), and casts the model to ``dtype``.
+    Returns:
+        ``(model, config)``.
+    Raises:
+        FileNotFoundError: if the snapshot contains no ``*.safetensors`` file.
+    """
+    import os
+    from pathlib import Path
+    from huggingface_hub import snapshot_download
+    from safetensors.torch import load_file
+    # NOTE(review): huggingface_hub may already have been imported (and have read
+    # its environment) by the time this runs — confirm the flag takes effect.
+    os.environ["HF_HUB_DISABLE_XET"] = "1"
+    print(f"Downloading {model_id}...")
+    model_dir = snapshot_download(model_id)
+    sf_files = sorted(Path(model_dir).glob("*.safetensors"))
+    if not sf_files:
+        # Without this check the function would return a randomly initialised
+        # model and still report "Loaded!".
+        raise FileNotFoundError(f"No .safetensors files found in {model_dir}")
+    config = SmartCoderMoEConfig()
+    print("Initializing model...")
+    model = SmartCoderMoEForCausalLM(config)
+    print("Loading weights...")
+    state_dict = {}
+    for f in sf_files:
+        state_dict.update(load_file(str(f)))
+    # experts_fc in safetensors: [32, 512, 2048] — matches our [NE, EI, H] ✓
+    # experts_proj in safetensors: [32, 2048, 512] — matches our [NE, H, EI] ✓
+    # router in safetensors: [32, 2048] — stored as Linear weight [out, in] ✓
+    missing, unexpected = model.load_state_dict(state_dict, strict=False)
+    if missing:
+        print(f"Missing keys: {missing[:5]}{'...' if len(missing)>5 else ''}")
+    if unexpected:
+        print(f"Unexpected keys: {unexpected[:5]}{'...' if len(unexpected)>5 else ''}")
+    model = model.to(dtype)
+    print(f"Loaded! Params: {sum(p.numel() for p in model.parameters())/1e9:.2f}B")
+    return model, config
+if __name__ == "__main__":
+    # Smoke test: load the model, move it to the GPU and sample a completion.
+    from transformers import AutoTokenizer
+    import torch
+    model, config = load_smartcoder_moe()
+    model = model.eval().cuda()
+    tokenizer = AutoTokenizer.from_pretrained("Johnblick187/SmartCoderMoE", trust_remote_code=True)
+    prompt = "def fibonacci(n):"
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # Remember the prompt length so only the newly generated tokens are printed.
+    input_len = inputs["input_ids"].shape[-1]
+    sampling = dict(
+        max_new_tokens=150,
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.95,
+        repetition_penalty=1.3,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    with torch.no_grad():
+        out = model.generate(**inputs, **sampling)
+    completion = out[0][input_len:]
+    print(tokenizer.decode(completion, skip_special_tokens=True))