---
# Stack 2.9 Configuration
# This file provides additional configuration options
# HTTP server settings: bind address, port, and worker-process count.
server:
  host: "0.0.0.0"  # bind on all interfaces
  port: 8000
  workers: 1
# Model selection and cache location.
model:
  # Default model - can be overridden by MODEL_ID env var
  id: "TheBloke/Llama-2-7B-Chat-AWQ"
  cache_dir: "/home/vllm/.cache/huggingface"
  # NOTE(review): allows execution of model-repo code — confirm the model source is trusted
  trust_remote_code: true
# vLLM engine configuration
vllm:
  tensor_parallel_size: 1
  gpu_memory_utilization: 0.9
  max_model_len: 4096
  max_num_seqs: 64
  max_num_batched_tokens: 4096
  quantization: "awq"  # matches the AWQ-quantized model above
  enforce_eager: false
  disable_log_stats: false
# Performance tuning
performance:
  # Thread configuration
  omp_num_threads: 4
  # CUDA settings
  cuda_launch_blocking: 0
  cudnn_loginfo_dbg: 1
# CORS (if needed for web UI)
cors:
  enabled: false
  # Wildcard origins/methods/headers — only safe while `enabled` stays false;
  # tighten these before enabling in production.
  allow_origins: ["*"]
  allow_methods: ["*"]
  allow_headers: ["*"]
# Logging
logging:
  level: "INFO"
  format: "json"
  include_timestamps: true