skypilot
# Environment variables made available to the task.
envs:
  # Model identifier; referenced as $MODEL_NAME by the readiness probe
  # payload and by the server command in `run`.
  MODEL_NAME: Qwen/Qwen1.5-7B-Chat

# Serving configuration: how replicas are health-checked and how many to run.
service:
  # Specifying the path to the endpoint to check the readiness of the replicas.
  readiness_probe:
    path: /v1/chat/completions
    # Request body sent to `path`; a minimal chat completion request.
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      # Ask for a single token only.
      max_tokens: 1
    # 1200 seconds (20 minutes).
    # NOTE(review): presumably sized to cover setup plus model download and
    # load time before probing starts — confirm.
    initial_delay_seconds: 1200
  # How many replicas to manage.
  replicas: 2

# Hardware and networking requirements for each replica.
resources:
  # Set syntax: a list of candidate accelerator types (see the SkyPilot
  # resources spec for how one is chosen).
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
  disk_tier: best
  # Port opened on the replica. The server command in `run` passes no
  # --port flag, so this relies on the server's default port.
  # NOTE(review): confirm that default is 8000.
  ports: 8000

# Environment preparation script: activate the `qwen` conda environment,
# creating it (Python 3.10) first if activation fails, then install pinned
# vllm and transformers versions into it.
setup: |
  conda activate qwen
  if [ $? -ne 0 ]; then
    conda create -n qwen python=3.10 -y
    conda activate qwen
  fi
  pip install -U vllm==0.3.2
  pip install -U transformers==4.38.0

# Launch script: start vLLM's OpenAI-compatible API server for $MODEL_NAME,
# listening on all interfaces, with tensor parallelism equal to
# $SKYPILOT_NUM_GPUS_PER_NODE and a 1024-token maximum model length.
# Server stdout is mirrored to ~/openai_api_server.log via tee.
# NOTE(review): unless the runner enables `pipefail`, the pipeline's exit
# status is tee's, not the server's — confirm this is acceptable.
run: |
  conda activate qwen
  export PATH=$PATH:/sbin
  python -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --model $MODEL_NAME \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-model-len 1024 | tee ~/openai_api_server.log