---
# Stack 2.9 Configuration
# This file provides additional configuration options
# HTTP server settings
server:
  host: "0.0.0.0"  # bind on all interfaces
  port: 8000
  workers: 1
model:
  # Default model - can be overridden by MODEL_ID env var
  id: "TheBloke/Llama-2-7B-Chat-AWQ"
  cache_dir: "/home/vllm/.cache/huggingface"
  # NOTE(review): allows executing code shipped with the model repo at load
  # time — keep true only for trusted model sources
  trust_remote_code: true
# vLLM engine configuration
vllm:
  tensor_parallel_size: 1
  # fraction of GPU memory the engine may reserve
  gpu_memory_utilization: 0.9
  max_model_len: 4096
  max_num_seqs: 64
  max_num_batched_tokens: 4096
  quantization: "awq"  # matches the AWQ-quantized default model above
  enforce_eager: false
  disable_log_stats: false
# Performance tuning
performance:
  # Thread configuration
  omp_num_threads: 4
  # CUDA settings (0 = async kernel launches; 1 = enable cuDNN info logging)
  cuda_launch_blocking: 0
  cudnn_loginfo_dbg: 1
# CORS (if needed for web UI)
cors:
  enabled: false
  # NOTE(review): wildcard origins/methods/headers are wide open — tighten
  # before enabling in production
  allow_origins: ["*"]
  allow_methods: ["*"]
  allow_headers: ["*"]
# Logging
logging:
  level: "INFO"
  format: "json"
  include_timestamps: true