---
# Docker Compose definition for the stack-2.9 vLLM server.
#
# Every ${VAR:-default} below is interpolated by Compose from the shell
# environment or an adjacent .env file; the value after ":-" is used when the
# variable is unset or empty.
version: '3.8'

services:
  stack-2.9:
    build:
      context: .
      dockerfile: Dockerfile
      # Kept in KEY=VALUE list form so values such as 3.10 stay strings and are
      # not re-typed as floats by the YAML parser.
      args:
        - PYTHON_VERSION=3.10
        - VLLM_VERSION=0.6.3
        - CUDA_VERSION=12.1.0
    container_name: stack-2.9-server
    restart: unless-stopped
    ports:
      # host port (STACK_PORT) -> container port (PORT). The container side
      # follows PORT so the mapping stays correct when PORT is overridden.
      - "${STACK_PORT:-8000}:${PORT:-8000}"
    environment:
      # Model configuration
      - MODEL_ID=${MODEL_ID:-TheBloke/Llama-2-7B-Chat-AWQ}
      - HUGGING_FACE_TOKEN=${HUGGING_FACE_TOKEN:-}
      - QUANTIZATION=${QUANTIZATION:-awq}
      # vLLM engine parameters
      - TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1}
      - GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.9}
      - MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
      - MAX_NUM_SEQS=${MAX_NUM_SEQS:-64}
      - MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-4096}
      - ENFORCE_EAGER=${ENFORCE_EAGER:-false}
      - DISABLE_LOG_STATS=${DISABLE_LOG_STATS:-false}
      # Server configuration
      # NOTE(review): assumes the image's entrypoint binds to HOST/PORT - confirm
      # against the Dockerfile.
      - HOST=${HOST:-0.0.0.0}
      - PORT=${PORT:-8000}
      - MODEL_CACHE_DIR=${MODEL_CACHE_DIR:-/home/vllm/.cache/huggingface}
      # Performance tuning
      - OMP_NUM_THREADS=${OMP_NUM_THREADS:-4}
      - CUDA_LAUNCH_BLOCKING=${CUDA_LAUNCH_BLOCKING:-0}
      # cuDNN debug switch; default kept at 1 for backward compatibility.
      # NOTE(review): consider defaulting to 0 outside of debugging sessions.
      - CUDNN_LOGINFO_DBG=${CUDNN_LOGINFO_DBG:-1}
    volumes:
      # Model cache persistence (named volume declared at the bottom of the file)
      - model_cache:/home/vllm/.cache/huggingface:rw
      # Optional: mount custom models (read-only)
      - ./models:/app/models:ro
    networks:
      - stack-network
    # GPU configuration (active): reserves all NVIDIA GPUs for this service.
    # Remove this block and `runtime: nvidia` below to run without GPU access.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Runtime configuration
    runtime: nvidia
    # Health check: probes the /health endpoint from inside the container.
    # ${PORT:-8000} is substituted by Compose when the file is loaded, so the
    # probe follows the same port the server is configured with.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${PORT:-8000}/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    # Resource limits
    # mem_limit: ${MEM_LIMIT:-8g}
    # mem_reservation: ${MEM_RESERVATION:-4g}

volumes:
  model_cache:
    driver: local

networks:
  stack-network:
    driver: bridge