Text Generation
Transformers
Safetensors
English
smartcoder_moe
Mixture of Experts
starcoder2
mixture-of-experts
code
smartcoder
conversational
custom_code
Instructions to use Johnblick187/SmartCoderMoE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Johnblick187/SmartCoderMoE with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Johnblick187/SmartCoderMoE", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Johnblick187/SmartCoderMoE", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use Johnblick187/SmartCoderMoE with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Johnblick187/SmartCoderMoE"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Johnblick187/SmartCoderMoE",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
```
Use Docker
docker model run hf.co/Johnblick187/SmartCoderMoE
- SGLang
How to use Johnblick187/SmartCoderMoE with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Johnblick187/SmartCoderMoE" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Johnblick187/SmartCoderMoE",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Johnblick187/SmartCoderMoE" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Johnblick187/SmartCoderMoE",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
```
- Docker Model Runner
How to use Johnblick187/SmartCoderMoE with Docker Model Runner:
docker model run hf.co/Johnblick187/SmartCoderMoE
| # modeling_smartcoder_moe.py | |
| #Architecture (from tensor inspection): | |
| #- vocab_size: 65536, hidden: 2048, layers: 40 | |
| #- Attention: q[2048,2048], k/v[512,2048] - 16 heads, 4 KV heads, head_dim=128 | |
| #- MLP (hybrid dense + MoE): | |
| # dense_fc: [8192, 2048] up | |
| # dense_proj: [2048, 8192] down | |
| # experts_fc: [32, 512, 2048] expert up (batched) | |
| # experts_proj: [32, 2048, 512] expert down (batched) | |
| # router: [32, 2048] router logits | |
| #- LayerNorm: weight+bias (input_layernorm, post_attention_layernorm) | |
| #- Final norm: model.norm.weight/bias | |
| import math | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin | |
| from transformers.modeling_outputs import CausalLMOutputWithPast | |
| # ── Config ──────────────────────────────────────────────────────────────────── | |
class SmartCoderMoEConfig(PretrainedConfig):
    """Hyper-parameters of the SmartCoder hybrid dense + mixture-of-experts decoder.

    Every constructor argument is stored on the instance under the same name.
    ``head_dim`` is not an argument: it is derived as
    ``hidden_size // num_attention_heads``.  ``rms_norm_eps`` is the epsilon
    handed to the model's LayerNorm layers.  The special-token ids,
    ``tie_word_embeddings`` and any extra ``kwargs`` are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "smartcoder_moe"

    def __init__(
        self,
        vocab_size=65536,
        hidden_size=2048,
        num_hidden_layers=40,
        num_attention_heads=16,
        num_key_value_heads=4,
        dense_intermediate_size=8192,
        num_experts=32,
        expert_intermediate_size=512,
        num_experts_per_tok=2,
        max_position_embeddings=16384,
        rope_theta=10000.0,
        rms_norm_eps=1e-5,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=0,
        tie_word_embeddings=False,
        **kwargs,
    ):
        # Vocabulary and trunk dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers

        # Attention geometry; the per-head width follows from the two sizes.
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = hidden_size // num_attention_heads

        # Feed-forward: one dense branch plus a bank of routed experts.
        self.dense_intermediate_size = dense_intermediate_size
        self.num_experts = num_experts
        self.expert_intermediate_size = expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok

        # Positional encoding and normalisation.
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps

        # Fields owned by the base class are passed up rather than set here.
        base_fields = dict(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
        )
        super().__init__(**base_fields, **kwargs)
| # ── RoPE ────────────────────────────────────────────────────────────────────── | |
def rotate_half(x):
    """Return ``[-back, front]`` where ``front``/``back`` split ``x``'s last axis at its midpoint.

    The split point is ``x.shape[-1] // 2``; the output has the same shape as ``x``.
    """
    midpoint = x.shape[-1] // 2
    front = x[..., :midpoint]
    back = x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
def apply_rotary_emb(q, k, cos, sin):
    """Apply the same rotary transform to ``q`` and ``k`` and return them as a tuple.

    Each tensor ``t`` becomes ``t * cos + rotate_half(t) * sin``; ``cos`` and
    ``sin`` must broadcast against both inputs.
    """
    q_rotated = (q * cos) + (rotate_half(q) * sin)
    k_rotated = (k * cos) + (rotate_half(k) * sin)
    return q_rotated, k_rotated
class RotaryEmbedding(nn.Module):
    """Lazily cached cos/sin tables for rotary position embeddings.

    Args:
        dim: width of the rotated vectors; ``dim // 2`` (rounded up) inverse
            frequencies are generated from ``torch.arange(0, dim, 2)``.
        max_pos: accepted for interface compatibility; it is not used, the
            tables grow on demand instead.
        base: base of the geometric frequency progression.
    """

    def __init__(self, dim, max_pos=16384, base=10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        # Number of positions currently covered by cos_cached / sin_cached.
        self._cached_len = 0

    def _build_cache(self, seq_len, device):
        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1`` on ``device``."""
        t = torch.arange(seq_len, device=device).float()
        freqs = torch.outer(t, self.inv_freq.to(device))
        # Frequencies are duplicated so the table spans both halves used by rotate_half.
        emb = torch.cat([freqs, freqs], dim=-1)
        # Non-persistent: the tables are derived data and stay out of the state dict.
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
        self._cached_len = seq_len

    def forward(self, seq_len, device):
        """Return ``(cos, sin)``, each shaped ``(1, 1, seq_len, 2 * len(inv_freq))``.

        The cache is rebuilt when it is too short, and also when it has never
        been built: previously a first call with ``seq_len == 0`` skipped the
        build (``0 > 0`` is false) and then failed with AttributeError on
        ``self.cos_cached``.
        """
        if seq_len > self._cached_len or not hasattr(self, "cos_cached"):
            self._build_cache(seq_len, device)
        return self.cos_cached[:, :, :seq_len, :], \
            self.sin_cached[:, :, :seq_len, :]
| # ── LayerNorm with bias ─────────────────────────────────────────────────────── | |
class LayerNormWithBias(nn.Module):
    """Layer normalisation over the last axis with a learned scale and shift.

    ``weight`` starts at ones and ``bias`` at zeros, both of length ``hidden_size``.
    """

    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` across its final dimension and apply the affine parameters."""
        last_axis = x.shape[-1:]
        return F.layer_norm(
            x,
            last_axis,
            weight=self.weight,
            bias=self.bias,
            eps=self.eps,
        )
| # ── Attention ───────────────────────────────────────────────────────────────── | |
class SmartCoderAttention(nn.Module):
    """Causal grouped-query self-attention with rotary position embeddings.

    Queries use ``num_attention_heads`` heads; keys and values use
    ``num_key_value_heads`` heads that are repeated to match the query heads.
    All four projections carry a bias. No key/value cache is kept: every call
    attends over the full ``T`` positions of ``hidden_states``.
    """

    def __init__(self, config: SmartCoderMoEConfig):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        self.head_dim = config.head_dim
        # How many query heads share each key/value head.
        self.num_kv_groups = self.num_heads // self.num_kv_heads
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * config.head_dim, bias=True)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * config.head_dim, bias=True)
        self.o_proj = nn.Linear(config.num_attention_heads * config.head_dim, config.hidden_size, bias=True)
        self.rotary_emb = RotaryEmbedding(config.head_dim, config.max_position_embeddings, config.rope_theta)

    @staticmethod
    def _padding_bias(attention_mask, dtype, device):
        """Convert a ``(B, S)`` keep-mask (non-zero = attend) into an additive ``(B, 1, 1, S)`` bias."""
        keep = attention_mask.to(device=device, dtype=torch.bool)[:, None, None, :]
        bias = torch.zeros(keep.shape, dtype=dtype, device=device)
        # A large finite value (not -inf) keeps rows whose visible keys are all
        # padding from turning into NaN after the softmax.
        return bias.masked_fill(~keep, torch.finfo(dtype).min)

    def forward(self, hidden_states, attention_mask=None, **kwargs):
        """Attend over ``hidden_states`` of shape ``(B, T, hidden)``.

        Args:
            hidden_states: input activations, ``(B, T, hidden_size)``.
            attention_mask: optional. A 2-D integer or boolean tensor is
                treated as a ``(B, T)`` padding mask (non-zero = keep) and is
                converted to an additive bias. Any other tensor is added to
                the attention scores unchanged, as before.
            **kwargs: ignored.

        Returns:
            Tensor of shape ``(B, T, hidden_size)``.
        """
        B, T, _ = hidden_states.shape
        q = self.q_proj(hidden_states).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(T, hidden_states.device)
        # The rotary tables are built in float32. Without this cast a bf16/fp16
        # model gets q and k promoted to float32 while v keeps the model dtype,
        # and the final attn @ v matmul fails on the dtype mismatch.
        cos = cos[:, :, :T, :self.head_dim].to(q.dtype)
        sin = sin[:, :, :T, :self.head_dim].to(q.dtype)
        q, k = apply_rotary_emb(q, k, cos, sin)

        # Expand the shared key/value heads so every query head has a partner.
        k = k.repeat_interleave(self.num_kv_groups, dim=1)
        v = v.repeat_interleave(self.num_kv_groups, dim=1)

        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        causal = torch.triu(torch.full((T, T), float("-inf"), device=q.device, dtype=q.dtype), diagonal=1)
        attn = attn + causal.unsqueeze(0).unsqueeze(0)

        if attention_mask is not None:
            if attention_mask.dim() == 2 and not attention_mask.is_floating_point():
                # Tokenizer-style 1/0 mask: adding it raw neither masks anything
                # nor broadcasts against (B, heads, T, T) when B > 1.
                attention_mask = self._padding_bias(attention_mask, attn.dtype, attn.device)
            attn = attn + attention_mask

        # Softmax in float32 for stability, then back to the working dtype.
        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
        out = torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)
        return self.o_proj(out)
| # ── MoE MLP ─────────────────────────────────────────────────────────────────── | |
class SmartCoderMoEMLP(nn.Module):
    """Feed-forward block: an always-on dense MLP plus top-k routed experts.

    The output is ``dense(x) + sum_k gate_k * expert_k(x)``. The gates are the
    top-``num_experts_per_tok`` softmax router probabilities, renormalised to
    sum to one per token. Both branches use GELU.

    ``experts_fc`` (``[num_experts, expert_intermediate, hidden]``) and
    ``experts_proj`` (``[num_experts, hidden, expert_intermediate]``) are
    created with ``torch.empty``; they hold arbitrary values until they are
    filled from a checkpoint or initialised explicitly.
    """

    def __init__(self, config: "SmartCoderMoEConfig"):
        super().__init__()
        H = config.hidden_size
        DI = config.dense_intermediate_size
        NE = config.num_experts
        EI = config.expert_intermediate_size
        self.num_experts = NE
        self.top_k = config.num_experts_per_tok
        self.dense_fc = nn.Linear(H, DI, bias=True)
        self.dense_proj = nn.Linear(DI, H, bias=True)
        self.experts_fc = nn.Parameter(torch.empty(NE, EI, H))
        self.experts_proj = nn.Parameter(torch.empty(NE, H, EI))
        self.router = nn.Linear(H, NE, bias=False)

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(B, T, hidden)``; returns the same shape."""
        B, T, H = x.shape
        dense_out = self.dense_proj(F.gelu(self.dense_fc(x)))

        # reshape (not view) so non-contiguous activations are accepted too.
        tokens = x.reshape(B * T, H)
        router_probs = F.softmax(self.router(tokens), dim=-1)
        top_weights, top_indices = router_probs.topk(self.top_k, dim=-1)
        top_weights = top_weights / top_weights.sum(dim=-1, keepdim=True)

        expert_out = torch.zeros_like(tokens)
        # Group tokens by expert and run one matmul pair per active expert.
        # Indexing the weight bank with a per-token id vector would instead copy
        # a full [EI, H] matrix for every token, which is gigabytes at real
        # sequence lengths. The .tolist() costs one host sync per call.
        for expert_id in torch.unique(top_indices).tolist():
            token_idx, slot_idx = torch.where(top_indices == expert_id)
            hidden = F.gelu(F.linear(tokens[token_idx], self.experts_fc[expert_id]))
            out = F.linear(hidden, self.experts_proj[expert_id])
            gate = top_weights[token_idx, slot_idx].unsqueeze(-1)
            # topk never selects the same expert twice for one token, so each
            # token index appears at most once per expert.
            expert_out.index_add_(0, token_idx, out * gate)

        return dense_out + expert_out.view(B, T, H)
| # ── Decoder Layer ───────────────────────────────────────────────────────────── | |
class SmartCoderDecoderLayer(nn.Module):
    """One pre-norm transformer block: attention then the MoE MLP, each with a residual add."""

    def __init__(self, config: SmartCoderMoEConfig):
        super().__init__()
        width = config.hidden_size
        norm_eps = config.rms_norm_eps
        self.input_layernorm = LayerNormWithBias(width, norm_eps)
        self.self_attn = SmartCoderAttention(config)
        self.post_attention_layernorm = LayerNormWithBias(width, norm_eps)
        self.mlp = SmartCoderMoEMLP(config)

    def forward(self, hidden_states, attention_mask=None, **kwargs):
        """Run the block on ``hidden_states``; extra ``kwargs`` are ignored."""
        # Attention sub-layer: normalise, attend, add back onto the stream.
        attn_out = self.self_attn(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-layer follows the same normalise / transform / add shape.
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        return hidden_states + mlp_out
| # ── Model ───────────────────────────────────────────────────────────────────── | |
class SmartCoderMoEModel(nn.Module):
    """Decoder trunk: token embedding, a stack of decoder layers, and a final LayerNorm."""

    def __init__(self, config: SmartCoderMoEConfig):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        decoder_blocks = []
        for _ in range(config.num_hidden_layers):
            decoder_blocks.append(SmartCoderDecoderLayer(config))
        self.layers = nn.ModuleList(decoder_blocks)
        self.norm = LayerNormWithBias(config.hidden_size, config.rms_norm_eps)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Embed ``input_ids`` and return the normalised final hidden states.

        ``attention_mask`` is handed unchanged to every layer; other ``kwargs``
        are ignored.
        """
        stream = self.embed_tokens(input_ids)
        for block in self.layers:
            stream = block(stream, attention_mask=attention_mask)
        return self.norm(stream)
| # ── CausalLM ────────────────────────────────────────────────────────────────── | |
class SmartCoderMoEForCausalLM(PreTrainedModel, GenerationMixin):
    """SmartCoder MoE trunk with an untied linear language-model head.

    No key/value cache is implemented: ``forward`` always returns
    ``past_key_values=None`` and generation re-runs the whole sequence each step.
    """

    config_class = SmartCoderMoEConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = False

    def __init__(self, config: SmartCoderMoEConfig):
        super().__init__(config)
        self.model = SmartCoderMoEModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        """Rename checkpoint expert keys before the regular loading logic runs.

        Checkpoints store the expert banks as ``experts_fc.weight`` /
        ``experts_proj.weight`` while the module parameters are named
        ``experts_fc`` / ``experts_proj``. The keys are renamed *in place*:
        the previous version renamed them in a private copy that was only
        passed to this module's own loader, so the nested MLP modules that
        actually own the parameters never saw the new names.
        """
        for old_key in list(state_dict.keys()):
            new_key = old_key.replace('experts_fc.weight', 'experts_fc')
            new_key = new_key.replace('experts_proj.weight', 'experts_proj')
            if new_key != old_key:
                state_dict[new_key] = state_dict.pop(old_key)
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    def get_input_embeddings(self):
        """Return the token embedding table."""
        return self.model.embed_tokens

    def get_output_embeddings(self):
        """Return the language-model head."""
        return self.lm_head

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        **kwargs,
    ):
        """Compute next-token logits and, when ``labels`` is given, the shifted LM loss.

        Args:
            input_ids: token ids; required.
            attention_mask: forwarded to the decoder stack.
            past_key_values, inputs_embeds, use_cache, **kwargs: accepted for
                API compatibility and ignored.
            labels: optional targets; positions equal to ``-100`` are excluded
                from the loss.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss``, ``logits`` and
            ``past_key_values=None``.

        Raises:
            ValueError: if ``input_ids`` is ``None``.
        """
        if input_ids is None:
            # inputs_embeds is not wired into the trunk; fail with a clear
            # message instead of an opaque error inside the embedding lookup.
            raise ValueError(
                "SmartCoderMoEForCausalLM requires `input_ids`; `inputs_embeds` is not supported."
            )
        hidden_states = self.model(input_ids, attention_mask=attention_mask)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Position t predicts token t + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )
        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=None)

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Pass only the full ``input_ids`` to ``forward``; all other generation kwargs are dropped."""
        return {"input_ids": input_ids}
| # ── Loader ──────────────────────────────────────────────────────────────────── | |
def load_smartcoder_moe(model_id="Johnblick187/SmartCoderMoE", dtype=torch.bfloat16):
    """Download a checkpoint from the Hub and load it into a fresh model.

    The model is built from a default ``SmartCoderMoEConfig()`` (the
    repository's own config file is not read), filled from every top-level
    ``*.safetensors`` file of the snapshot, then cast to ``dtype``.

    Args:
        model_id: Hub repository id passed to ``snapshot_download``.
        dtype: dtype the loaded model is converted to.

    Returns:
        ``(model, config)``.

    Raises:
        FileNotFoundError: if the snapshot contains no ``*.safetensors`` file.
            Previously this case silently returned a randomly initialised model.
    """
    import os
    from pathlib import Path

    # NOTE(review): set ahead of the hub import in case the flag is read at
    # import time; confirm it still takes effect when huggingface_hub has
    # already been imported elsewhere in the process.
    os.environ["HF_HUB_DISABLE_XET"] = "1"

    from huggingface_hub import snapshot_download
    from safetensors.torch import load_file

    print(f"Downloading {model_id}...")
    model_dir = snapshot_download(model_id)
    config = SmartCoderMoEConfig()
    print("Initializing model...")
    model = SmartCoderMoEForCausalLM(config)
    print("Loading weights...")
    sf_files = sorted(Path(model_dir).glob("*.safetensors"))
    if not sf_files:
        raise FileNotFoundError(f"No .safetensors files found in {model_dir}")

    state_dict = {}
    for f in sf_files:
        # Checkpoint expert tensors carry a ".weight" suffix that the
        # bare-Parameter attributes on the module do not have.
        for key, tensor in load_file(str(f)).items():
            key = key.replace('experts_fc.weight', 'experts_fc')
            key = key.replace('experts_proj.weight', 'experts_proj')
            state_dict[key] = tensor

    # strict=False: mismatches are reported below rather than raised.
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    if missing:
        print(f"Missing: {missing[:3]}{'...' if len(missing)>3 else ''}")
    if unexpected:
        print(f"Unexpected: {unexpected[:3]}{'...' if len(unexpected)>3 else ''}")
    model = model.to(dtype)
    print(f"Loaded! Params: {sum(p.numel() for p in model.parameters())/1e9:.2f}B")
    return model, config
from transformers import AutoConfig, AutoModelForCausalLM

# Register the custom classes so that the "smartcoder_moe" model type resolves
# through the Auto* factories once this module has been imported.
AutoConfig.register(model_type="smartcoder_moe", config=SmartCoderMoEConfig)
AutoModelForCausalLM.register(
    config_class=SmartCoderMoEConfig,
    model_class=SmartCoderMoEForCausalLM,
)