# Base image: slim Python 3.9 variant; the steps below rely on apt-get and pip being available in it.
FROM python:3.9-slim

# Install compilation tools (used whenever pip has to build a package from source) and curl.
# --no-install-recommends plus removing the apt lists in the same layer keeps the image small.
# ca-certificates is listed explicitly because it is only a recommended (not hard) dependency
# of curl's library and would otherwise be skipped by --no-install-recommends.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Downgrade Uvicorn to avoid strict HTTP header parsing issues behind the HF Envoy proxy.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir "uvicorn<0.30.0"

# Install llama-cpp-python with its server extra, plus huggingface-hub for the model download.
# The extra index provides pre-compiled CPU wheels so the long C++ build can be skipped.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir \
        "llama-cpp-python[server]" \
        huggingface-hub \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# CRITICAL FIX: Hugging Face proxy injects a non-UUID 'X-Request-ID' header (e.g. 'qKsNoh').
# starlette_context's RequestIdPlugin strictly validates this as UUIDv4 by default and throws
# a 400 Bad Request with an empty body. We must patch llama_cpp to disable this validation.
#
# The patch is a plain text substitution in llama_cpp/server/app.py. If the expected
# 'RequestIdPlugin()' text is not present (e.g. the installed version changed that line),
# the build fails here instead of silently shipping an unpatched server.
RUN python3 -c "import sys, pathlib, llama_cpp.server.app as app; \
    target = pathlib.Path(app.__file__); \
    old = 'RequestIdPlugin()'; \
    new = 'RequestIdPlugin(validate=False)'; \
    source = target.read_text(); \
    sys.exit('patch target ' + old + ' not found in ' + str(target)) if old not in source else target.write_text(source.replace(old, new))"

# Fetch the quantized GGUF model at build time so it is baked into the image under /model
# (Qwen 1.5B is small and fast enough for free CPU instances).
RUN python3 -c "from huggingface_hub import hf_hub_download as fetch; \
    fetch( \
        local_dir='/model', \
        repo_id='Qwen/Qwen2.5-1.5B-Instruct-GGUF', \
        filename='qwen2.5-1.5b-instruct-q4_k_m.gguf', \
    )"

# Port the API server listens on (documentation only; it does not publish the port).
EXPOSE 7860

# Run the server as an unprivileged user instead of root. UID 1000 is used so the
# account matches the user ID Hugging Face Spaces runs Docker containers with.
# The server only needs read access to /model and an unprivileged port (7860).
RUN useradd --create-home --uid 1000 user
USER user

# Start the OpenAI-compatible API server with explicit limits for free CPU instances
# - n_threads / n_threads_batch: 2 (matches 2 vCPU limit)
# - n_ctx: 2048 (limits context memory usage)
CMD ["python3", "-m", "llama_cpp.server", \
     "--model", "/model/qwen2.5-1.5b-instruct-q4_k_m.gguf", \
     "--chat_format", "chatml", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--n_ctx", "2048", \
     "--n_threads", "2", \
     "--n_threads_batch", "2"]