# =============================================================================
# Stack 2.9 GPU Dockerfile
# Multi-stage build for NVIDIA GPU (CUDA 11.8 + cuDNN 8)
# =============================================================================
# Usage:
#   Build:  docker build -f Dockerfile.gpu -t stack-2.9-gpu .
#   Run:    docker compose -f docker-compose.gpu.yml up
#   Or:     docker run --rm --gpus all -p 8000:8000 \
#             -v $(pwd)/base_model_qwen7b:/model:ro \
#             stack-2.9-gpu
# =============================================================================
# -----------------------------------------------------------------------------
# Stage 1: Builder
# Install Python deps into an isolated virtualenv, then copy only the venv
# into the runtime stage (compilers never reach the final image).
# -----------------------------------------------------------------------------
FROM python:3.11-slim AS builder

WORKDIR /build

# Build dependencies: compilers/headers for packages that ship no binary wheel.
RUN apt-get update && apt-get install -y --no-install-recommends \
      build-essential \
      curl \
    && rm -rf /var/lib/apt/lists/*

# A dedicated venv lets the whole dependency tree be COPY'd as one unit.
RUN python -m venv /opt/venv \
    && /opt/venv/bin/pip install --upgrade pip setuptools wheel

# Install torch FIRST so that anything in requirements_api.txt that depends on
# torch resolves against the CUDA 11.8 build, instead of first pulling the
# default (CPU-only) PyPI wheel and having it overwritten by a second install.
# For air-gapped envs, swap --index-url for a local mirror.
RUN /opt/venv/bin/pip install --no-cache-dir \
      torch==2.1.2 \
      torchvision==0.16.2 \
      --index-url https://download.pytorch.org/whl/cu118

# API / inference dependencies (copied alone to keep this layer cacheable
# until requirements_api.txt itself changes).
COPY requirements_api.txt .
RUN /opt/venv/bin/pip install --no-cache-dir -r requirements_api.txt

# Transformers ecosystem (GPU-ready builds).
# NOTE: the version specifier MUST be quoted — unquoted, /bin/sh parses
# `>=0.21.0` as an output redirection, so pip would install an unconstrained
# huggingface_hub and a junk file named `=0.21.0` would be created.
RUN /opt/venv/bin/pip install --no-cache-dir \
      transformers==4.39.3 \
      peft==0.10.0 \
      accelerate==0.28.0 \
      bitsandbytes==0.43.1 \
      "huggingface_hub>=0.21.0"
# -----------------------------------------------------------------------------
# Stage 2: Runtime
# Slim runtime image with CUDA libraries, running as non-root.
# -----------------------------------------------------------------------------
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 AS runtime

# HF caches live under the non-root user's HOME: /model is documented as a
# read-only mount, so pointing the cache at /model/.cache would fail with
# EROFS the first time transformers tries to write there.
# (DEBIAN_FRONTEND is deliberately NOT set here — it is applied inline on the
# apt RUN so it does not leak into the runtime environment.)
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    HF_HOME=/home/stack/.cache/huggingface \
    TRANSFORMERS_CACHE=/home/stack/.cache/huggingface \
    CUDA_VISIBLE_DEVICES=0 \
    PORT=8000 \
    HOST=0.0.0.0

WORKDIR /app

# Runtime Python + basic utils (no compilers here; curl is kept for the
# HEALTHCHECK below).
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
    && apt-get install -y --no-install-recommends \
      curl \
      git \
      python3.11 \
      python3.11-venv \
      python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf python3.11 /usr/bin/python \
    # The venv built in python:3.11-slim symlinks its interpreter to
    # /usr/local/bin/python3.11, which does not exist in this Ubuntu base.
    # Without this compat link, /opt/venv/bin/python is a dangling symlink
    # and the ENTRYPOINT fails with "no such file or directory".
    && ln -sf /usr/bin/python3.11 /usr/local/bin/python3.11

# Copy the fully-populated virtualenv from the builder stage.
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Non-root user for security; UID/GID overridable at build time so bind-mount
# ownership can match the host user.
ARG UID=1000
ARG GID=1000
RUN groupadd --gid $GID stack \
    && useradd --uid $UID --gid $GID --shell /bin/bash --create-home stack

# Model mount point (a read-only bind mount is expected here at runtime).
RUN mkdir -p /model && chown stack:stack /model

# Inference entrypoint script, owned by the runtime user.
COPY --chown=stack:stack inference_api.py .

# Switch to non-root before any runtime instruction.
USER stack:stack

# Healthcheck — probe the API's own /health endpoint over HTTP (cheap and
# side-effect free; generous start-period covers model load time).
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
  CMD curl -sf http://localhost:${PORT}/health || exit 1

# Documentation only — the published port is still chosen at `docker run -p`.
EXPOSE ${PORT}

# Model weights are expected to be mounted at /model at runtime, e.g.:
#   docker run -v /path/to/base_model_qwen7b:/model:ro stack-2.9-gpu
ENV MODEL_PATH=/model

# Exec form: python runs as PID 1 and receives SIGTERM from `docker stop`.
ENTRYPOINT ["python", "inference_api.py"]