# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
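# cargo-chef computes a dependency-only build plan (recipe.json) so the
# builder stage below can cache compiled dependencies across source changes.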
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

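# Install protoc, needed to compile the gRPC definitions in `proto/`.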
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
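# Compile only the dependencies from the recipe; this layer stays cached
# as long as the dependency set is unchanged.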
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release

# Text Generation Inference base image for ROCm
FROM rocm/dev-ubuntu-22.04:5.7 AS base

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    git \
    make \
    libssl-dev \
    g++ \
    # Needed to build vLLM & flash-attention.
    rocthrust-dev \
    hipsparse-dev \
    hipblas-dev && \
    rm -rf /var/lib/apt/lists/*

# Keep in sync with `server/pyproject.toml`
ARG MAMBA_VERSION=23.1.0-1
ARG PYTORCH_VERSION='2.2.0.dev0'
ARG ROCM_VERSION='5.7'
ARG PYTHON_VERSION='3.10.10'
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH

# TGI seems to require libssl.so.1.1 instead of libssl.so.3, so we can't use Ubuntu 22.04's Python. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    mamba init && \
    rm ~/mambaforge.sh

# Install PyTorch 2.2 RC compiled against ROCm 5.7, as vLLM cannot be compiled with ROCm 5.6.
RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/

FROM base AS kernel-builder

# Build vLLM kernels
FROM kernel-builder AS vllm-builder
WORKDIR /usr/src

COPY server/Makefile-vllm Makefile

# Build specific version of vLLM
RUN make build-vllm-rocm

# Build Flash Attention v2 kernels
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2-rocm

# Build Transformers custom kernels (gpt-neox and bloom); the CUDA sources are hipified for ROCm by the PyTorch extension build.
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
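# gfx90a targets AMD Instinct MI200-series accelerators (MI210/MI250);
# adjust PYTORCH_ROCM_ARCH when building for other GPUs.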
RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build

# Build exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

# Build exllama v2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .

RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

FROM base AS base-copy

# Text Generation Inference base env
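# HUGGINGFACE_HUB_CACHE=/data: downloaded model weights land in /data;
# mount a volume there to persist them across container runs.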
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Copy build artifacts from vLLM builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
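# `make gen-server` generates the Python gRPC stubs from the proto
# definitions before the server package itself is installed.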
RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt && \
    pip install ".[accelerate, peft, outlines]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS SageMaker compatible image
FROM base-copy AS sagemaker
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base-copy

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]

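# A minimal sketch of building and running this image; the `tgi-rocm` tag and
# model id are illustrative, not part of this file. ROCm containers need the
# kfd/dri devices passed through:
#   docker build -f Dockerfile_amd -t tgi-rocm .
#   docker run --device=/dev/kfd --device=/dev/dri --group-add video \
#       -p 8080:80 -v $PWD/data:/data tgi-rocm --model-id <model-id>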