# syntax=docker/dockerfile:1
# Source: Stack-2-9-finetuned/config/Dockerfile.gpu
# Provenance: commit b8e3e42 ("reorganize: consolidate root level to 20 folders"),
# author walidsobhie-code. (These lines were page residue; kept as comments.)
# =============================================================================
# Stack 2.9 GPU Dockerfile
# Multi-stage build for NVIDIA GPU (CUDA 11.8 + cuDNN 8)
# =============================================================================
# Usage:
# Build: docker build -f Dockerfile.gpu -t stack-2.9-gpu .
# Run: docker compose -f docker-compose.gpu.yml up
# Or: docker run --rm --gpus all -p 8000:8000 \
# -v $(pwd)/base_model_qwen7b:/model:ro \
# stack-2.9-gpu
# =============================================================================
# -----------------------------------------------------------------------------
# Stage 1: Builder
# Install Python deps into a wheel, then discard the bulk of the build layer.
# -----------------------------------------------------------------------------
FROM python:3.11-slim AS builder
WORKDIR /build
# Build dependencies — compilers live only in this stage and never reach runtime.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
    && rm -rf /var/lib/apt/lists/*
# Self-contained venv; the whole /opt/venv tree is COPY'd into the runtime stage.
RUN python -m venv /opt/venv \
    && /opt/venv/bin/pip install --upgrade pip setuptools wheel
# Install torch with CUDA 11.8 wheels FIRST, so that a later
# `pip install -r requirements_api.txt` cannot pull the CPU-only PyPI torch in
# as a transitive dependency (pip will see torch==2.1.2 already satisfied).
# For air-gapped envs, swap --index-url for a local mirror.
RUN /opt/venv/bin/pip install --no-cache-dir \
    torch==2.1.2 \
    torchvision==0.16.2 \
    --index-url https://download.pytorch.org/whl/cu118
# API / inference deps — copied separately so this layer stays cached until
# requirements_api.txt itself changes.
COPY requirements_api.txt .
RUN /opt/venv/bin/pip install --no-cache-dir -r requirements_api.txt
# Transformers ecosystem (GPU-ready builds).
# NOTE: the ">=" spec MUST be quoted — unquoted, the shell parses `>=0.21.0`
# as an output redirection, silently installing an unpinned huggingface_hub
# and leaving a stray file literally named "=0.21.0" in the layer.
RUN /opt/venv/bin/pip install --no-cache-dir \
    transformers==4.39.3 \
    peft==0.10.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.1 \
    "huggingface_hub>=0.21.0"
# -----------------------------------------------------------------------------
# Stage 2: Runtime
# Slim runtime image with CUDA libraries, running as non-root.
# -----------------------------------------------------------------------------
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 AS runtime

# HF caches must live somewhere writable: the usage header mounts /model
# read-only (:ro), so pointing TRANSFORMERS_CACHE/HF_HOME inside /model would
# make every cache write fail at runtime. Use the app user's home instead.
# (DEBIAN_FRONTEND is deliberately NOT set here — it is passed inline to apt
# below so it does not leak into the container's runtime environment.)
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    TRANSFORMERS_CACHE=/home/stack/.cache/huggingface \
    HF_HOME=/home/stack/.cache/huggingface \
    CUDA_VISIBLE_DEVICES=0 \
    PORT=8000 \
    HOST=0.0.0.0
WORKDIR /app
# Runtime Python + basic utils (no compilers needed here).
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        git \
        python3-pip \
        python3.11 \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf python3.11 /usr/bin/python \
    # The venv copied below was created in python:3.11-slim, whose interpreter
    # lives at /usr/local/bin/python3.11; the venv's bin/python symlink points
    # there. Ubuntu installs it at /usr/bin/python3.11, so without this shim
    # the venv's interpreter symlink dangles and `python` fails to start.
    && ln -sf /usr/bin/python3.11 /usr/local/bin/python3.11
# Copy virtualenv from builder; putting it first on PATH makes `python` and
# `pip` resolve to the venv.
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Non-root user with a stable, overridable UID/GID (useful for bind-mount
# ownership and runtimes that verify runAsNonRoot).
ARG UID=1000
ARG GID=1000
RUN groupadd --gid $GID stack \
    && useradd --uid $UID --gid $GID --shell /bin/bash --create-home stack
# Model mount point plus a writable HF cache dir owned by the app user.
RUN mkdir -p /model /home/stack/.cache/huggingface \
    && chown -R stack:stack /model /home/stack/.cache
# Copy inference entrypoint
COPY --chown=stack:stack inference_api.py .
# Switch to non-root before any runtime instruction.
USER stack:stack
# Healthcheck — probe the API's /health endpoint (shell form so ${PORT} expands).
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -sf http://localhost:${PORT}/health || exit 1
# Documentation only — the port is actually published via `docker run -p`.
EXPOSE ${PORT}
# Model is expected to be mounted at /model at runtime.
# Example: docker run -v /path/to/base_model_qwen7b:/model:ro stack-2.9-gpu
ENV MODEL_PATH=/model
# Exec form: python (the venv's, via PATH) runs as PID 1 and receives SIGTERM.
ENTRYPOINT ["python", "inference_api.py"]