---
# Stack 2.9 Configuration
# This file provides additional configuration options

server:
  host: "0.0.0.0"
  port: 8000
  workers: 1

model:
  # Default model - can be overridden by MODEL_ID env var
  id: "TheBloke/Llama-2-7B-Chat-AWQ"
  cache_dir: "/home/vllm/.cache/huggingface"
  trust_remote_code: true

# vLLM engine configuration
vllm:
  tensor_parallel_size: 1
  gpu_memory_utilization: 0.9
  max_model_len: 4096
  max_num_seqs: 64
  max_num_batched_tokens: 4096
  quantization: "awq"
  enforce_eager: false
  disable_log_stats: false

# Performance tuning
performance:
  # Thread configuration
  omp_num_threads: 4
  # CUDA settings
  cuda_launch_blocking: 0
  cudnn_loginfo_dbg: 1

# CORS (if needed for web UI)
cors:
  enabled: false
  allow_origins: ["*"]
  allow_methods: ["*"]
  allow_headers: ["*"]

# Logging
logging:
  level: "INFO"
  format: "json"
  include_timestamps: true