# Image serving an OpenAI-compatible llama.cpp API (llama-cpp-python server)
# on port 7860, with a Qwen2.5 1.5B GGUF model baked into the image.
FROM python:3.9-slim

# Install compilation tools.
# --no-install-recommends and removing the apt lists in the same layer keep
# the layer small; ca-certificates is listed explicitly so HTTPS downloads do
# not depend on it arriving as a "recommended" package.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Downgrade Uvicorn to avoid strict HTTP header parsing issues behind the
# Hugging Face Envoy proxy.
RUN pip install --no-cache-dir "uvicorn<0.30.0"

# Install llama-cpp-python with its server extra, using the pre-compiled CPU
# wheel index to skip the long C++ build.
RUN pip install --no-cache-dir "llama-cpp-python[server]" huggingface-hub \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# CRITICAL FIX: the Hugging Face proxy injects a non-UUID 'X-Request-ID'
# header (e.g. 'qKsNoh'). starlette_context's RequestIdPlugin validates it as
# a UUIDv4 by default and answers 400 Bad Request with an empty body, so
# llama_cpp's server module is patched to disable that validation.
# The assert makes the build fail loudly if the expected text is no longer
# present, instead of silently shipping an unpatched server.
RUN python3 -c "import pathlib, llama_cpp.server.app as app; target = pathlib.Path(app.__file__); source = target.read_text(); assert 'RequestIdPlugin()' in source, 'RequestIdPlugin() not found in llama_cpp.server.app - the patch needs updating'; target.write_text(source.replace('RequestIdPlugin()', 'RequestIdPlugin(validate=False)'))"

# Download a fast, compressed model (Qwen 1.5B suits free CPU instances).
RUN python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='Qwen/Qwen2.5-1.5B-Instruct-GGUF', filename='qwen2.5-1.5b-instruct-q4_k_m.gguf', local_dir='/model')"

# Run the server as an unprivileged user. This is created after every step
# that needs root (package installs, site-packages patch, model download);
# the server only needs to read /model at runtime.
RUN useradd --create-home --uid 1000 user
USER user

EXPOSE 7860

# Start the OpenAI-compatible API server with explicit limits for free CPU
# instances:
#   --n_threads / --n_threads_batch 2  (matches the 2 vCPU limit)
#   --n_ctx 2048                       (limits context memory usage)
CMD ["python3", "-m", "llama_cpp.server", "--model", "/model/qwen2.5-1.5b-instruct-q4_k_m.gguf", "--chat_format", "chatml", "--host", "0.0.0.0", "--port", "7860", "--n_ctx", "2048", "--n_threads", "2", "--n_threads_batch", "2"]