# Repository-viewer residue (not configuration), kept for provenance:
#   author: walidsobhie-code
#   change: "refactor: Squeeze folders further - cleaner structure"
#   revision: 65888d5
#   size: 893 Bytes
# Stack 2.9 Configuration
# This file provides additional configuration options
# Server settings: listen address, port and worker count.
server:
  # NOTE(review): "0.0.0.0" is the IPv4 wildcard address. If the consumer
  # uses this as the bind address, the service listens on every interface;
  # confirm that is intended for the deployment.
  host: "0.0.0.0"
  port: 8000
  workers: 1
# Model selection and download cache.
model:
  # Default model - can be overridden by MODEL_ID env var
  id: "TheBloke/Llama-2-7B-Chat-AWQ"
  cache_dir: "/home/vllm/.cache/huggingface"
  # NOTE(review): presumably forwarded to the model loader, where it would
  # permit executing code shipped with the model repository. Confirm this
  # is required for the default model before keeping it enabled.
  trust_remote_code: true
# vLLM engine configuration
# NOTE(review): key names look like vLLM engine arguments; confirm how the
# consumer maps them onto the engine before changing any of them.
vllm:
  tensor_parallel_size: 1
  gpu_memory_utilization: 0.9
  max_model_len: 4096
  max_num_seqs: 64
  # Same value as max_model_len above.
  max_num_batched_tokens: 4096
  # The default model id in this file ends in "-AWQ".
  quantization: "awq"
  enforce_eager: false
  disable_log_stats: false
# Performance tuning
# NOTE(review): the key names mirror the OMP_NUM_THREADS,
# CUDA_LAUNCH_BLOCKING and CUDNN_LOGINFO_DBG environment variables;
# presumably the consumer exports them as such - confirm.
performance:
  # Thread configuration
  omp_num_threads: 4
  # CUDA settings
  cuda_launch_blocking: 0
  # NOTE(review): 1 looks like cuDNN informational logging switched on,
  # which is unusual in a performance-tuning section; confirm it is
  # intentional and not a leftover debugging setting.
  cudnn_loginfo_dbg: 1
# CORS (if needed for web UI)
cors:
  # Disabled here. The wildcard lists below are the values that apply if
  # this is switched on; narrow them before enabling.
  enabled: false
  allow_origins: ["*"]
  allow_methods: ["*"]
  allow_headers: ["*"]
# Logging
logging:
  level: "INFO"
  format: "json"
  include_timestamps: true