# Launch script: serve a Qwen3 model with vLLM on two GPUs.
#!/usr/bin/env bash
# Serve a Qwen3 model with vLLM, tensor-parallel across two GPUs.
# YaRN rope scaling (factor 4.0 over a 32768-token base) extends the context window.
set -euo pipefail

# Restrict vLLM to GPUs 0 and 1.
# NOTE: this must be *exported* — a bare assignment is only a shell variable
# and would not be visible to the vllm child process.
export CUDA_VISIBLE_DEVICES=0,1

# Previously tried model variants, kept for reference:

# vllm serve Qwen/Qwen3-0.6B-GPTQ-Int8 --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization gptq --served-model-name Qwen3

# vllm serve Qwen/Qwen3-30B-A3B-GPTQ-Int4 --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization gptq --served-model-name Qwen3

# vllm serve Qwen/Qwen3-14B-MLX-8bit --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --served-model-name Qwen3

# vllm serve Qwen/Qwen3-1.7B-GPTQ-Int8 --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization gptq --served-model-name Qwen3

# Active configuration: Qwen3-32B AWQ-quantized, exposed under model name "Qwen3".
vllm serve Qwen/Qwen3-32B-AWQ \
  --tensor-parallel-size 2 \
  --dtype auto \
  --no-enable-chunked-prefill \
  --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \
  --quantization awq \
  --served-model-name Qwen3