# The vLLM Dockerfile is used to construct the vLLM image that can be used directly
# to run the OpenAI-compatible server.
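#
# A minimal build example (a sketch; the tag "vllm/vllm-openai" is an arbitrary
# name chosen here, and BuildKit is assumed because of the cache mounts below):
#   DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm/vllm-openai .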

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
    && apt-get install -y python3-pip git

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
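#
# The ARGs above can be overridden at build time, e.g. (a sketch; the values
# are illustrative only and should match the builder's CPU and target GPUs):
#   docker build --target build --build-arg max_jobs=8 --build-arg nvcc_threads=4 .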

RUN python3 setup.py build_ext --inplace
#################### EXTENSION BUILD IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
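#
# The test image can be built and exercised roughly like this (a sketch; the
# tag and the pytest invocation are assumptions, not defined in this file):
#   docker build --target test -t vllm-test .
#   docker run --gpus all vllm-test pytest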
#################### TEST IMAGE ####################


#################### RUNTIME BASE IMAGE ####################
# We originally used the base CUDA image because PyTorch installs its own CUDA libraries.
# However, cupy depends on the CUDA libraries, so we had to switch to the runtime image.
# In the future it would be nice to get a container with PyTorch and CUDA without duplicating CUDA.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

# libnccl required for ray
RUN apt-get update -y \
    && apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################


#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
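#
# A minimal run example (a sketch; the image tag, model name, and host port are
# illustrative assumptions -- the server listens on port 8000 by default, and
# arguments after the image name are passed to the api_server entrypoint):
#   docker run --gpus all -p 8000:8000 vllm/vllm-openai --model facebook/opt-125m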
#################### OPENAI API SERVER ####################