# The vLLM Dockerfile is used to construct the vLLM image that can be used directly
# to run the OpenAI-compatible server.
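#
# A minimal build example (a sketch; the tag "vllm/vllm-openai" is an arbitrary
# name chosen here, and BuildKit is assumed because of the cache mounts below):
#   DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm/vllm-openai .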

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
    && apt-get install -y python3-pip git

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
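#
# The ARGs above can be overridden at build time, e.g. (a sketch; the values
# are illustrative only and should match the builder's CPU and target GPUs):
#   docker build --target build --build-arg max_jobs=8 --build-arg nvcc_threads=4 .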

RUN python3 setup.py build_ext --inplace
#################### EXTENSION BUILD IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
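#
# The test image can be built and exercised roughly like this (a sketch; the
# tag and the pytest invocation are assumptions, not defined in this file):
#   docker build --target test -t vllm-test .
#   docker run --gpus all vllm-test pytest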
#################### TEST IMAGE ####################


#################### RUNTIME BASE IMAGE ####################
# We originally used the base CUDA image because PyTorch installs its own CUDA libraries.
# However, cupy depends on the CUDA libraries, so we had to switch to the runtime image.
# In the future it would be nice to get a container with PyTorch and CUDA without duplicating CUDA.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

# libnccl required for ray
RUN apt-get update -y \
    && apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################


#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
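#
# A minimal run example (a sketch; the image tag, model name, and host port are
# illustrative assumptions -- the server listens on port 8000 by default, and
# arguments after the image name are passed to the api_server entrypoint):
#   docker run --gpus all -p 8000:8000 vllm/vllm-openai --model facebook/opt-125m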
#################### OPENAI API SERVER ####################