# syntax=docker/dockerfile:1
# =============================================================================
# Stack 2.9 GPU Dockerfile
# Multi-stage build for NVIDIA GPU (CUDA 11.8 + cuDNN 8)
# =============================================================================
# Usage:
#   Build:  docker build -f Dockerfile.gpu -t stack-2.9-gpu .
#   Run:    docker compose -f docker-compose.gpu.yml up
#   Or:     docker run --rm --gpus all -p 8000:8000 \
#             -v $(pwd)/base_model_qwen7b:/model:ro \
#             stack-2.9-gpu
# =============================================================================

# -----------------------------------------------------------------------------
# Stage 1: Builder
# Install Python deps into a virtualenv, then discard the bulk of the build
# layer. Only /opt/venv is copied into the runtime stage.
# -----------------------------------------------------------------------------
FROM python:3.11-slim AS builder

WORKDIR /build

# Build dependencies (compilers for any sdist wheels; curl for fetch scripts).
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Create the venv that will be shipped to the runtime stage.
# NOTE(review): a venv hard-links/symlinks to the interpreter that created it
# (/usr/local/bin/python3.11 in this image) — the runtime stage must provide
# that path or the venv's python is a dangling symlink.
RUN python -m venv /opt/venv \
    && /opt/venv/bin/pip install --upgrade pip setuptools wheel

# Install API / inference deps first so this layer is cached independently of
# the heavier torch/transformers layers below.
COPY requirements_api.txt .
RUN /opt/venv/bin/pip install --no-cache-dir -r requirements_api.txt

# Install torch with CUDA 11.8 support from the PyTorch wheel index.
# For air-gapped envs, swap --index-url for a local mirror.
RUN /opt/venv/bin/pip install --no-cache-dir \
        torch==2.1.2 \
        torchvision==0.16.2 \
        --index-url https://download.pytorch.org/whl/cu118

# Install transformers ecosystem (GPU-ready builds).
# The >= specifier MUST be quoted: unquoted, the shell treats ">=0.21.0" as an
# output redirection to a file named "=0.21.0" and pip installs an unpinned
# huggingface_hub.
RUN /opt/venv/bin/pip install --no-cache-dir \
        transformers==4.39.3 \
        peft==0.10.0 \
        accelerate==0.28.0 \
        bitsandbytes==0.43.1 \
        "huggingface_hub>=0.21.0"
# -----------------------------------------------------------------------------
# Stage 2: Runtime
# Slim runtime image with CUDA libraries, running as non-root.
# -----------------------------------------------------------------------------
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 AS runtime

# Runtime configuration. DEBIAN_FRONTEND is deliberately NOT baked in here —
# it is a build-time-only knob, applied inline on the apt RUN below.
# NOTE(review): TRANSFORMERS_CACHE/HF_HOME point inside /model, but the usage
# examples mount /model read-only (:ro) — cache writes will fail on a :ro
# mount. Confirm the mount is writable or relocate the cache (e.g. /tmp).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    TRANSFORMERS_CACHE=/model/.cache \
    HF_HOME=/model/.cache \
    CUDA_VISIBLE_DEVICES=0 \
    PORT=8000 \
    HOST=0.0.0.0

WORKDIR /app

# Install runtime Python + basic utils (no compilers needed here).
# The extra symlink at /usr/local/bin/python3.11 is REQUIRED: the venv copied
# from the python:3.11-slim builder symlinks its bin/python to
# /usr/local/bin/python3.11, which does not exist on Ubuntu (python lives at
# /usr/bin/python3.11) — without it the venv interpreter is a dangling symlink
# and the ENTRYPOINT fails.
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        curl \
        git \
        python3-pip \
        python3.11 \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf python3.11 /usr/bin/python \
    && ln -sf /usr/bin/python3.11 /usr/local/bin/python3.11

# Copy virtualenv from builder and put it first on PATH.
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Create non-root user for security (UID/GID overridable at build time for
# host-volume ownership alignment).
ARG UID=1000
ARG GID=1000
RUN groupadd --gid $GID stack \
    && useradd --uid $UID --gid $GID --shell /bin/bash --create-home stack

# Create model mount point
RUN mkdir -p /model && chown stack:stack /model

# Copy inference entrypoint
COPY --chown=stack:stack inference_api.py .

# Switch to non-root
USER stack:stack

# Healthcheck — polls the API's /health endpoint (liveness of the server, not
# a direct CUDA probe). start-period is generous to allow model load.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -sf http://localhost:${PORT}/health || exit 1

# Documentation only; PORT=8000 is set via ENV above so this expands at build.
EXPOSE ${PORT}

# Model is expected to be mounted at /model at runtime.
# Example: docker run -v /path/to/base_model_qwen7b:/model:ro stack-2.9-gpu
ENV MODEL_PATH=/model

# Exec form: python (the venv's, via PATH) runs as PID 1 and receives SIGTERM.
ENTRYPOINT ["python", "inference_api.py"]