#!/bin/bash
set -e

echo "=========================================="
echo "Phi-4-mini-instruct Agentic Model Server"
echo "=========================================="
echo ""

# Configuration (exported so the embedded Python scripts below can read them)
export MODEL_REPO="${MODEL_REPO:-unsloth/Phi-4-mini-instruct-GGUF}"
export MODEL_FILE="${MODEL_FILE:-Phi-4-mini-instruct-Q4_K_M.gguf}"
export N_CTX="${N_CTX:-8192}"
export N_THREADS="${N_THREADS:-2}"
export HOST="${HOST:-0.0.0.0}"
export PORT="${PORT:-7860}"
export MODEL_DIR="${MODEL_DIR:-/app/models}"

echo "Configuration:"
echo "  Model Repo:     $MODEL_REPO"
echo "  Model File:     $MODEL_FILE"
echo "  Context Length: $N_CTX"
echo "  Threads:        $N_THREADS"
echo "  Host:           $HOST"
echo "  Port:           $PORT"
echo ""

# Create model directory
mkdir -p "$MODEL_DIR"

# Download model if it does not already exist
MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
if [ ! -f "$MODEL_PATH" ]; then
    echo "Downloading model from HuggingFace..."
    echo "This may take a few minutes on first run..."

    python3 << 'PYTHON_DOWNLOAD'
import os
from huggingface_hub import hf_hub_download

repo = os.environ.get("MODEL_REPO", "unsloth/Phi-4-mini-instruct-GGUF")
filename = os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf")
model_dir = os.environ.get("MODEL_DIR", "/app/models")

print(f"Downloading {filename} from {repo}...")
model_path = hf_hub_download(
    repo_id=repo,
    filename=filename,
    local_dir=model_dir,
    local_dir_use_symlinks=False
)
print(f"Model saved to: {model_path}")
PYTHON_DOWNLOAD

    echo "Model download complete!"
else
    echo "Model already cached at: $MODEL_PATH"
fi

echo ""
echo "Starting Phi-4-mini-instruct server on port $PORT..."
echo "OpenAI-compatible API endpoints available:"
echo "  - POST /v1/chat/completions"
echo "  - POST /v1/completions"
echo "  - GET  /v1/models"
echo "  - GET  /health"
echo ""
echo "Tool calling example:"
echo '  curl -X POST http://localhost:7860/v1/chat/completions \'
echo '    -H "Content-Type: application/json" \'
echo '    -d '"'"'{"model": "phi-4-mini", "messages": [...], "tools": [...]}'"'"''
echo ""

# Start the FastAPI server
python3 << 'PYTHON_SERVER'
import os
import json
import time
from typing import List, Optional, Dict, Any, Union
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
import uvicorn
from llama_cpp import Llama

# Configuration
MODEL_PATH = os.environ.get("MODEL_DIR", "/app/models") + "/" + os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf")
N_CTX = int(os.environ.get("N_CTX", "8192"))
N_THREADS = int(os.environ.get("N_THREADS", "2"))

# Global model instance
llm = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the model once at startup; release on shutdown
    global llm
    print(f"Loading model from: {MODEL_PATH}")
    print(f"Context length: {N_CTX}, Threads: {N_THREADS}")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        verbose=False
    )
    print("Model loaded successfully!")
    yield
    print("Shutting down...")


app = FastAPI(
    title="Phi-4-mini-instruct API",
    description="OpenAI-compatible API for Phi-4-mini-instruct with tool calling support",
    version="1.0.0",
    lifespan=lifespan
)


# Pydantic models for OpenAI compatibility
class ChatMessage(BaseModel):
    role: str
    content: Optional[str] = None
    name: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None
    tool_call_id: Optional[str] = None


class ToolFunction(BaseModel):
    name: str
    description: Optional[str] = ""
    parameters: Optional[Dict] = {}


class Tool(BaseModel):
    type: str = "function"
    function: ToolFunction


class ChatCompletionRequest(BaseModel):
    model: str = "phi-4-mini"
    messages: List[ChatMessage]
    tools: Optional[List[Tool]] = None
    tool_choice: Optional[Union[str, Dict]] = "auto"
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False
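
# Illustrative request body accepted by ChatCompletionRequest above (not used
# at runtime). The "get_weather" tool and its parameters are hypothetical;
# the tool schema follows the standard OpenAI function-calling format:
#
#   {
#     "model": "phi-4-mini",
#     "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
#     "tools": [{
#       "type": "function",
#       "function": {
#         "name": "get_weather",
#         "description": "Get current weather for a city",
#         "parameters": {
#           "type": "object",
#           "properties": {"city": {"type": "string"}},
#           "required": ["city"]
#         }
#       }
#     }]
#   }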

class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "microsoft"


@app.get("/health")
async def health_check():
    return {"status": "healthy", "model_loaded": llm is not None}


@app.get("/v1/models")
async def list_models():
    return {
        "object": "list",
        "data": [
            {
                "id": "phi-4-mini",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "microsoft"
            }
        ]
    }


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Convert messages to llama.cpp format
    messages = []
    for msg in request.messages:
        message = {"role": msg.role, "content": msg.content or ""}
        if msg.tool_calls:
            message["tool_calls"] = msg.tool_calls
        if msg.tool_call_id:
            message["tool_call_id"] = msg.tool_call_id
        messages.append(message)

    # Prepare tools if provided
    tools = None
    if request.tools:
        tools = [t.model_dump() for t in request.tools]

    try:
        response = llm.create_chat_completion(
            messages=messages,
            tools=tools,
            tool_choice=request.tool_choice if tools else None,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream
        )

        if request.stream:
            # Re-emit llama.cpp chunks as OpenAI-style server-sent events
            async def generate():
                for chunk in response:
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"
            return StreamingResponse(generate(), media_type="text/event-stream")

        return JSONResponse(content=response)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/completions")
async def completions(request: dict):
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    prompt = request.get("prompt", "")
    max_tokens = request.get("max_tokens", 2048)
    temperature = request.get("temperature", 0.7)
    stream = request.get("stream", False)

    try:
        response = llm(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream
        )

        if stream:
            async def generate():
                for chunk in response:
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"
            return StreamingResponse(generate(), media_type="text/event-stream")

        return JSONResponse(content=response)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    uvicorn.run(
        app,
        host=os.environ.get("HOST", "0.0.0.0"),
        port=int(os.environ.get("PORT", 7860))
    )
PYTHON_SERVER
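
# Example: every setting above can be overridden via environment variables at
# launch. Illustrative only -- the script name and the Q8_0 quantization
# filename below are hypothetical and depend on your setup and the files
# actually published in the model repo:
#
#   MODEL_FILE=Phi-4-mini-instruct-Q8_0.gguf N_CTX=4096 PORT=8000 ./start.sh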