---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- zai-org/GLM-4.5
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [zai-org/GLM-4.5](https://huggingface.co/zai-org/GLM-4.5).

Note: The `transformers` implementation does not support multi-token prediction (MTP), so you might see some "weights not loaded" warnings. This is expected.

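To confirm that only the MTP tensors are skipped, you can inspect the loading report. A minimal sketch: `output_loading_info=True` is standard `transformers`; exactly which keys end up unused depends on this checkpoint.

```python
# Minimal sketch: list the checkpoint weights this implementation does not load.
# The MTP tensors are expected to appear under "unexpected_keys".
from transformers import AutoModelForCausalLM

model, loading_info = AutoModelForCausalLM.from_pretrained(
    "tiny-random/glm-4.5",
    trust_remote_code=True,
    output_loading_info=True,
)
print(loading_info["unexpected_keys"])
```
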
### Example usage:

- vLLM

```bash
model_id=tiny-random/glm-4.5
vllm serve $model_id \
  --tensor-parallel-size 1 \
  --tool-call-parser glm4_moe \
  --reasoning-parser glm4_moe \
  --enable-auto-tool-choice
```

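Once the server is up, it can be queried through vLLM's OpenAI-compatible API. A minimal sketch, assuming the default endpoint (`http://localhost:8000/v1`) and the `openai` client package:

```python
# Minimal sketch: query the vLLM server started above.
# The base_url/port are vLLM defaults; adjust if you changed them.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="tiny-random/glm-4.5",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=20,
)
print(response.choices[0].message.content)
```
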
- SGLang

```bash
# Multi-token prediction is supported
model_id=tiny-random/glm-4.5
python3 -m sglang.launch_server \
  --model-path $model_id \
  --tp-size 1 \
  --cuda-graph-max-bs 4 \
  --tool-call-parser glm45 \
  --reasoning-parser glm45 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --mem-fraction-static 0.4
```

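The server can then be queried through SGLang's native `/generate` endpoint (an OpenAI-compatible API is also exposed under `/v1`). A minimal sketch, assuming the default port 30000 and the `requests` package:

```python
# Minimal sketch: call the SGLang server started above.
# Port 30000 is the SGLang default; adjust if you changed it.
import requests

resp = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Hello World!",
        "sampling_params": {"max_new_tokens": 20, "temperature": 0.8},
    },
)
print(resp.json()["text"])
```
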
- Transformers

```python
from transformers import pipeline

model_id = "tiny-random/glm-4.5"
pipe = pipeline(
    "text-generation", model=model_id, device="cuda",
    trust_remote_code=True, max_new_tokens=20,
)
print(pipe("Hello World!"))
```

### Code to create this repo:

```python
from copy import deepcopy

import torch
import torch.nn as nn
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    pipeline,
    set_seed,
)
from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeDecoderLayer, Glm4MoeRMSNorm

source_model_id = "zai-org/GLM-4.5"
save_folder = "/tmp/tiny-random/glm-4.5"

tokenizer = AutoTokenizer.from_pretrained(
    source_model_id, trust_remote_code=True,
)
tokenizer.save_pretrained(save_folder)

config = AutoConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)
# Shrink the config down to a tiny, randomly initialized model.
config.hidden_size = 16
config.head_dim = 64
config.intermediate_size = 64
config.num_attention_heads = 4
config.num_hidden_layers = 2  # 1 dense, 1 MoE (see first_k_dense_replace)
config.num_key_value_heads = 2
config.moe_intermediate_size = 64
config.n_routed_experts = 16
config.n_shared_experts = 1
config.first_k_dense_replace = 1
config.num_experts_per_tok = 8
config.num_nextn_predict_layers = 1  # after layers 0 and 1, there will be another MTP layer
config.tie_word_embeddings = True

torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(
    config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Mirror the MTP (multi-token prediction) layer layout of the original
# checkpoint, so that the saved weights include the MTP tensors.
class SharedHead(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.norm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # self.head = deepcopy(model.get_output_embeddings())

class Glm4MoeDecoderMTP(Glm4MoeDecoderLayer):
    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx=layer_idx)
        self.enorm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.hnorm = Glm4MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
        self.shared_head = SharedHead(config=config)
        # self.embed_tokens = deepcopy(model.get_input_embeddings())

# Append the extra MTP layer and randomly initialize all parameters.
last_extra_layer = Glm4MoeDecoderMTP(config, layer_idx=config.num_hidden_layers)
model.model.layers.append(last_extra_layer)
model.generation_config = GenerationConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)
set_seed(42)
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)
model.save_pretrained(save_folder)
```
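
As a quick sanity check (a minimal sketch, assuming the `save_folder` path above), the saved model can be loaded back and run:

```python
# Minimal sanity check: reload the tiny model from save_folder and generate.
from transformers import pipeline

pipe = pipeline(
    "text-generation", model="/tmp/tiny-random/glm-4.5", device="cuda",
    trust_remote_code=True, max_new_tokens=16,
)
print(pipe("Hello World!"))
```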