{
  "template_name": "stack-2.9-vastai",
  "description": "Stack 2.9 LLM Inference Server for Vast.ai",
  "created_at": "2025-06-18",
  "docker_image": "your-registry/stack-2.9:latest",
  "docker_tag": "latest",
  "instance_type": "recommended",

  "instance": {
    "gpu_type": "RTX_4090",
    "min_gpu_mem_gb": 24,
    "min_disk_space_gb": 50,
    "min_ram_gb": 16,
    "min_vcpu_count": 4,
    "max_price_usd_per_hour": 0.50,
    "min_duration_seconds": 3600,
    "max_duration_seconds": 86400
  },

  "environment": {
    "MODEL_ID": "TheBloke/Llama-2-7B-Chat-AWQ",
    "QUANTIZATION": "awq",
    "TENSOR_PARALLEL_SIZE": "1",
    "GPU_MEMORY_UTILIZATION": "0.9",
    "MAX_MODEL_LEN": "4096",
    "MAX_NUM_SEQS": "64",
    "PORT": "8000",
    "HOST": "0.0.0.0"
  },

  "ssh": {
    "enabled": true,
    "port": 22,
    "username": "root"
  },

  "ports": [
    {
      "host_port": 8000,
      "container_port": 8000,
      "protocol": "tcp"
    },
    {
      "host_port": 2222,
      "container_port": 22,
      "protocol": "tcp",
      "purpose": "ssh"
    }
  ],

  "startup_script": "#!/bin/bash\nset -e\n\n# Wait for NVIDIA drivers\necho 'Waiting for NVIDIA drivers...'\nwhile ! nvidia-smi &> /dev/null; do\n  sleep 2\ndone\necho 'NVIDIA drivers detected'\n\n# Initialize model cache directory\nmkdir -p /home/vllm/.cache/huggingface\nchmod 755 /home/vllm/.cache/huggingface\n\n# Check if Hugging Face token is provided\nif [ -n \"$HUGGING_FACE_TOKEN\" ]; then\n  echo \"Logging in to Hugging Face...\"\n  python3 -c \"from huggingface_hub import login; login(token='$HUGGING_FACE_TOKEN')\"\nfi\n\n# Pre-download model if MODEL_CACHE_DIR exists and is empty\nif [ -d \"/home/vllm/.cache/huggingface\" ] && [ -z \"$(ls -A /home/vllm/.cache/huggingface 2>/dev/null)\" ]; then\n  echo \"Pre-downloading model: $MODEL_ID\"\n  python3 -c \"\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nmodel_id = '$MODEL_ID'\nprint(f'Downloading {model_id}...')\ntry:\n    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n    print('Tokenizer downloaded')\nexcept Exception as e:\n    print(f'Tokenizer error (continuing): {e}')\ntry:\n    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', trust_remote_code=True)\n    print('Model downloaded')\nexcept Exception as e:\n    print(f'Model error (vLLM will handle): {e}')\n\"\nfi\n\n# Start the server\necho 'Starting Stack 2.9 server...'\nexec python3 /app/app.py",

  "health_check": {
    "type": "HTTP",
    "endpoint": "/health",
    "interval_seconds": 30,
    "timeout_seconds": 10,
    "max_failures": 3
  },

  "pricing": {
    "bid_strategy": "spot",
    "max_bid_multiplier": 1.2,
    "min_bid_usd_per_hour": 0.0
  },

  "setup_commands": [
    "apt-get update && apt-get install -y python3 python3-pip git curl wget libgomp1"
  ],

  "notes": [
    "Based on NVIDIA CUDA 12.1 runtime",
    "Model cache is persisted to /home/vllm/.cache/huggingface",
    "Uses vLLM for high-performance inference",
    "OpenAI-compatible API endpoints",
    "SSH available on host port 2222, forwarded to container port 22"
  ]
}