# Provenance (from Hugging Face): uploaded as data3/run.sh via huggingface_hub
# by user DouDou, commit a8b22df (verified).
#!/usr/bin/env bash
# Launch a quantized Qwen3 model with vLLM, tensor-parallel across two GPUs,
# with YARN rope scaling (4x over a 32768-token base context).
# Requires: vllm installed; at least two CUDA devices.
set -euo pipefail

# Restrict vLLM to the first two GPUs. Must be exported — a bare assignment
# on its own line is not inherited by the child `vllm` process.
export CUDA_VISIBLE_DEVICES=0,1

# Alternative models (uncomment one and comment out the active line below):
# vllm serve Qwen/Qwen3-0.6B-GPTQ-Int8 --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization gptq --served-model-name Qwen3
# vllm serve Qwen/Qwen3-30B-A3B-GPTQ-Int4 --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization gptq --served-model-name Qwen3
# vllm serve Qwen/Qwen3-14B-MLX-8bit --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --served-model-name Qwen3
# vllm serve Qwen/Qwen3-1.7B-GPTQ-Int8 --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization gptq --served-model-name Qwen3

# Active model: 32B AWQ-quantized, exposed under the served name "Qwen3".
vllm serve Qwen/Qwen3-32B-AWQ --tensor-parallel-size 2 --dtype auto --no-enable-chunked-prefill --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' --quantization awq --served-model-name Qwen3