#!/bin/bash
set -e

echo "=========================================="
echo "Phi-4-mini-instruct Agentic Model Server"
echo "=========================================="
echo ""
|
|
# Configuration (overridable via environment variables); exported so the
# embedded Python processes below see the same values
export MODEL_REPO="${MODEL_REPO:-unsloth/Phi-4-mini-instruct-GGUF}"
export MODEL_FILE="${MODEL_FILE:-Phi-4-mini-instruct-Q4_K_M.gguf}"
export N_CTX="${N_CTX:-8192}"
export N_THREADS="${N_THREADS:-2}"
export HOST="${HOST:-0.0.0.0}"
export PORT="${PORT:-7860}"
export MODEL_DIR="${MODEL_DIR:-/app/models}"
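
# Usage example (hypothetical values): any of the settings above can be
# overridden at launch, e.g.
#   N_CTX=16384 N_THREADS=4 PORT=8080 ./start.sh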

echo "Configuration:"
echo "  Model Repo:     $MODEL_REPO"
echo "  Model File:     $MODEL_FILE"
echo "  Context Length: $N_CTX"
echo "  Threads:        $N_THREADS"
echo "  Host:           $HOST"
echo "  Port:           $PORT"
echo ""

# Ensure the model cache directory exists
mkdir -p "$MODEL_DIR"

# Download the model on first run; later starts reuse the cached copy
MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
if [ ! -f "$MODEL_PATH" ]; then
    echo "Downloading model from HuggingFace..."
    echo "This may take a few minutes on first run..."
    python3 << 'PYTHON_DOWNLOAD'
import os
from huggingface_hub import hf_hub_download

repo = os.environ.get("MODEL_REPO", "unsloth/Phi-4-mini-instruct-GGUF")
filename = os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf")
model_dir = os.environ.get("MODEL_DIR", "/app/models")

print(f"Downloading {filename} from {repo}...")
model_path = hf_hub_download(
    repo_id=repo,
    filename=filename,
    local_dir=model_dir,
    # Deprecated (and ignored) in recent huggingface_hub releases; kept for
    # compatibility with older versions
    local_dir_use_symlinks=False,
)
print(f"Model saved to: {model_path}")
PYTHON_DOWNLOAD
    echo "Model download complete!"
else
    echo "Model already cached at: $MODEL_PATH"
fi
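
# A rough CLI equivalent of the download step above, assuming the
# huggingface_hub CLI is available in the image:
#   huggingface-cli download "$MODEL_REPO" "$MODEL_FILE" --local-dir "$MODEL_DIR"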

echo ""
echo "Starting Phi-4-mini-instruct server on port $PORT..."
echo "OpenAI-compatible API endpoints available:"
echo "  - POST /v1/chat/completions"
echo "  - POST /v1/completions"
echo "  - GET  /v1/models"
echo "  - GET  /health"
echo ""
echo "Tool calling example:"
echo "  curl -X POST http://localhost:$PORT/v1/chat/completions \\"
echo '    -H "Content-Type: application/json" \'
echo '    -d '"'"'{"model": "phi-4-mini", "messages": [...], "tools": [...]}'"'"
echo ""
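
# For reference, a filled-in tool-calling request could look like this
# (get_weather is an illustrative tool name, not something this server defines):
#   curl -X POST "http://localhost:$PORT/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "phi-4-mini",
#           "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
#           "tools": [{
#             "type": "function",
#             "function": {
#               "name": "get_weather",
#               "description": "Get the current weather for a city",
#               "parameters": {
#                 "type": "object",
#                 "properties": {"city": {"type": "string"}},
#                 "required": ["city"]
#               }
#             }
#           }]
#         }'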

# Start the OpenAI-compatible FastAPI server (embedded below)
python3 << 'PYTHON_SERVER'
import os
import json
import time
from typing import List, Optional, Dict, Any, Union
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
import uvicorn
from llama_cpp import Llama

# Server configuration, mirroring the shell defaults above
MODEL_PATH = os.path.join(
    os.environ.get("MODEL_DIR", "/app/models"),
    os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf"),
)
N_CTX = int(os.environ.get("N_CTX", "8192"))
N_THREADS = int(os.environ.get("N_THREADS", "2"))

# Global model handle, populated once at startup via the lifespan hook
llm = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the GGUF model once at startup and keep it in memory for all requests
    global llm
    print(f"Loading model from: {MODEL_PATH}")
    print(f"Context length: {N_CTX}, Threads: {N_THREADS}")

    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        verbose=False
    )
    print("Model loaded successfully!")
    yield
    print("Shutting down...")

app = FastAPI(
    title="Phi-4-mini-instruct API",
    description="OpenAI-compatible API for Phi-4-mini-instruct with tool calling support",
    version="1.0.0",
    lifespan=lifespan
)

# Request schemas (OpenAI-compatible)
class ChatMessage(BaseModel):
    role: str
    content: Optional[str] = None
    name: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None
    tool_call_id: Optional[str] = None


class ToolFunction(BaseModel):
    name: str
    description: Optional[str] = ""
    parameters: Optional[Dict] = {}


class Tool(BaseModel):
    type: str = "function"
    function: ToolFunction


class ChatCompletionRequest(BaseModel):
    model: str = "phi-4-mini"
    messages: List[ChatMessage]
    tools: Optional[List[Tool]] = None
    tool_choice: Optional[Union[str, Dict]] = "auto"
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False
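
# A minimal request body that validates against ChatCompletionRequest
# (illustrative values only):
#   {
#     "model": "phi-4-mini",
#     "messages": [{"role": "user", "content": "Hello"}],
#     "temperature": 0.2
#   }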

class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "microsoft"

@app.get("/health")
async def health_check():
    return {"status": "healthy", "model_loaded": llm is not None}


@app.get("/v1/models")
async def list_models():
    return {
        "object": "list",
        "data": [
            {
                "id": "phi-4-mini",
                "object": "model",
                "created": int(time.time()),
                "owned_by": "microsoft"
            }
        ]
    }
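
# Quick smoke test once the server is up (from a shell, default port assumed):
#   curl http://localhost:7860/health
#   curl http://localhost:7860/v1/models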

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Convert Pydantic messages into the plain dicts llama-cpp-python expects
    messages = []
    for msg in request.messages:
        message = {"role": msg.role, "content": msg.content or ""}
        if msg.tool_calls:
            message["tool_calls"] = msg.tool_calls
        if msg.tool_call_id:
            message["tool_call_id"] = msg.tool_call_id
        messages.append(message)

    # Pass tool definitions through only when the client supplied them
    tools = None
    if request.tools:
        tools = [t.model_dump() for t in request.tools]

    try:
        response = llm.create_chat_completion(
            messages=messages,
            tools=tools,
            tool_choice=request.tool_choice if tools else None,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream
        )

        if request.stream:
            # Re-emit llama.cpp chunks as OpenAI-style server-sent events
            async def generate():
                for chunk in response:
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"
            return StreamingResponse(generate(), media_type="text/event-stream")

        return JSONResponse(content=response)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
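
# Client-side usage sketch, assuming the `openai` Python package (any recent
# 1.x release) pointed at this server:
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="phi-4-mini",
#       messages=[{"role": "user", "content": "Hello"}],
#   )
#   print(resp.choices[0].message.content)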

@app.post("/v1/completions")
async def completions(request: dict):
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    prompt = request.get("prompt", "")
    max_tokens = request.get("max_tokens", 2048)
    temperature = request.get("temperature", 0.7)
    stream = request.get("stream", False)

    try:
        # Calling the Llama instance directly runs plain text completion
        response = llm(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream
        )

        if stream:
            async def generate():
                for chunk in response:
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"
            return StreamingResponse(generate(), media_type="text/event-stream")

        return JSONResponse(content=response)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
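
# Example legacy-completions call (from a shell, default port assumed):
#   curl -X POST http://localhost:7860/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Once upon a time", "max_tokens": 32}'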

if __name__ == "__main__":
    uvicorn.run(
        app,
        host=os.environ.get("HOST", "0.0.0.0"),
        port=int(os.environ.get("PORT", 7860))
    )
PYTHON_SERVER