# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
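# cargo-chef computes a dependency-only build plan (recipe.json) so the
# builder stage below can cache compiled dependencies across source changes.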
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

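# Install protoc, needed to compile the gRPC definitions in `proto/`.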
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
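# Compile only the dependencies from the recipe; this layer stays cached
# as long as the dependency set is unchanged.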
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release

# Text Generation Inference base image for ROCm
FROM rocm/dev-ubuntu-22.04:5.7 AS base

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    git \
    make \
    libssl-dev \
    g++ \
    # Needed to build vLLM & flash-attention.
    rocthrust-dev \
    hipsparse-dev \
    hipblas-dev && \
    rm -rf /var/lib/apt/lists/*

# Keep in sync with `server/pyproject.toml`
ARG MAMBA_VERSION=23.1.0-1
ARG PYTORCH_VERSION='2.2.0.dev0'
ARG ROCM_VERSION='5.7'
ARG PYTHON_VERSION='3.10.10'
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH

# TGI seems to require libssl.so.1.1 instead of libssl.so.3, so we can't use Ubuntu 22.04's Python. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    mamba init && \
    rm ~/mambaforge.sh

# Install PyTorch 2.2 RC compiled against ROCm 5.7, as vLLM cannot be compiled with ROCm 5.6.
RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/

FROM base AS kernel-builder

# Build vLLM kernels
FROM kernel-builder AS vllm-builder
WORKDIR /usr/src

COPY server/Makefile-vllm Makefile

# Build specific version of vLLM
RUN make build-vllm-rocm

# Build Flash Attention v2 kernels
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-rocm

# Build Transformers custom kernels (gpt-neox and bloom); the CUDA sources are hipified for ROCm by the PyTorch extension build.
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
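# gfx90a targets AMD Instinct MI200-series accelerators (MI210/MI250);
# adjust PYTORCH_ROCM_ARCH when building for other GPUs.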
RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build

# Build exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

# Build exllama v2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

FROM base AS base-copy

# Text Generation Inference base env
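# HUGGINGFACE_HUB_CACHE=/data: downloaded model weights land in /data;
# mount a volume there to persist them across container runs.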
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Copy build artifacts from vLLM builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
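# `make gen-server` generates the Python gRPC stubs from the proto
# definitions before the server package itself is installed.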
RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt && \
    pip install ".[accelerate, peft, outlines]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS SageMaker compatible image
FROM base-copy AS sagemaker
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base-copy

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]

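# A minimal sketch of building and running this image; the `tgi-rocm` tag and
# model id are illustrative, not part of this file. ROCm containers need the
# kfd/dri devices passed through:
#   docker build -f Dockerfile_amd -t tgi-rocm .
#   docker run --device=/dev/kfd --device=/dev/dri --group-add video \
#       -p 8080:80 -v $PWD/data:/data tgi-rocm --model-id <model-id>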