From 646bae3b6e3bf3b4e15a7ebbc2edae6ed9af765a Mon Sep 17 00:00:00 2001
From: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
Date: Mon, 7 Oct 2024 12:52:02 +0200
Subject: [PATCH 1/4] Update releases.json

---
 releases.json | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/releases.json b/releases.json
index 3139826..0636989 100644
--- a/releases.json
+++ b/releases.json
@@ -37,6 +37,15 @@
                 "python_version": "py310",
                 "pytorch_version": "2.3.0"
             },
+            {
+                "device": "gpu",
+                "min_version": "2.3.1",
+                "max_version": "2.3.1",
+                "os_version": "ubuntu22.04",
+                "cuda_version": "cu124",
+                "python_version": "py311",
+                "pytorch_version": "2.4.0"
+            },
             {
                 "device": "inf2",
                 "min_version": "0.0.16",
@@ -80,11 +89,11 @@
         {
             "framework": "TGI",
             "device": "gpu",
-            "version": "2.2.0",
+            "version": "2.3.1",
             "os_version": "ubuntu22.04",
-            "python_version": "py310",
-            "pytorch_version": "2.3.0",
-            "cuda_version": "cu121"
+            "python_version": "py311",
+            "pytorch_version": "2.4.0",
+            "cuda_version": "cu124"
         }
     ]
 }

From 0c0dd8323c68e4c6f2fd5556baf1860df26f8523 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 7 Oct 2024 12:52:31 +0200
Subject: [PATCH 2/4] init prev image

---
 .../pytorch/tgi/docker/2.3.0/Dockerfile       | 303 ++++++++++++++++++
 1 file changed, 303 insertions(+)
 create mode 100644 huggingface/pytorch/tgi/docker/2.3.0/Dockerfile

diff --git a/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile b/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile
new file mode 100644
index 0000000..e65c6f4
--- /dev/null
+++ b/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile
@@ -0,0 +1,303 @@
+# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --profile release-opt
+
+# Python builder
+# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
+
+# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
+ARG PYTORCH_VERSION=2.3.0
+ARG PYTHON_VERSION=3.10
+# Keep in sync with `server/pyproject.toml
+ARG CUDA_VERSION=12.1
+ARG MAMBA_VERSION=24.3.0-0
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+ENV PATH /opt/conda/bin:$PATH
+
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    ccache \
+    curl \
+    libexpat1 \
+    libgssapi-krb5-2 \
+    git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+    "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+    *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+    "linux/arm64")  exit 1 ;; \
+    *)              /opt/conda/bin/conda update -y conda &&  \
+    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# CUDA kernels builder image
+FROM pytorch-install AS kernel-builder
+
+ARG MAX_JOBS=8
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ninja-build cmake \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build Flash Attention CUDA kernels
+FROM kernel-builder AS flash-att-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att Makefile
+
+# Build specific version of flash attention
+RUN make build-flash-attention
+
+# Build Flash Attention v2 CUDA kernels
+FROM kernel-builder AS flash-att-v2-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att-v2 Makefile
+
+# Build specific version of flash attention v2
+RUN make build-flash-attention-v2-cuda
+
+# Build Transformers exllama kernels
+FROM kernel-builder AS exllama-kernels-builder
+WORKDIR /usr/src
+COPY server/exllama_kernels/ .
+
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Transformers exllama kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+WORKDIR /usr/src
+COPY server/exllamav2_kernels/ .
+
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Transformers awq kernels
+FROM kernel-builder AS awq-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-awq Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+
+# Build eetq kernels
+FROM kernel-builder AS eetq-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-eetq Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+
+# Build marlin kernels
+FROM kernel-builder AS marlin-kernels-builder
+WORKDIR /usr/src
+COPY server/marlin/ .
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Lorax Punica kernels
+FROM kernel-builder AS lorax-punica-builder
+WORKDIR /usr/src
+COPY server/Makefile-lorax-punica Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+
+# Build Transformers CUDA kernels
+FROM kernel-builder AS custom-kernels-builder
+WORKDIR /usr/src
+COPY server/custom_kernels/ .
+# Build specific version of transformers
+RUN python setup.py build
+
+# Build FBGEMM CUDA kernels
+FROM kernel-builder AS fbgemm-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-fbgemm Makefile
+COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
+COPY server/fix_torch90a.sh fix_torch90a.sh
+
+RUN make build-fbgemm
+
+# Build vllm CUDA kernels
+FROM kernel-builder AS vllm-builder
+
+WORKDIR /usr/src
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
+COPY server/Makefile-vllm Makefile
+
+# Build specific version of vllm
+RUN make build-vllm-cuda
+
+# Build mamba kernels
+FROM kernel-builder AS mamba-builder
+WORKDIR /usr/src
+COPY server/Makefile-selective-scan Makefile
+RUN make build-all
+
+# Text Generation Inference base image
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base
+
+# Conda env
+ENV PATH=/opt/conda/bin:$PATH \
+    CONDA_PREFIX=/opt/conda
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/tmp \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+WORKDIR /usr/src
+
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    libssl-dev \
+    ca-certificates \
+    make \
+    unzip \
+    curl \
+    git \
+    libexpat1 \
+    libgssapi-krb5-2 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
+
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from marlin kernels builder
+COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from fbgemm builder
+COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from vllm builder
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+
+# Install flash-attention dependencies
+RUN pip install einops --no-cache-dir
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_cuda.txt && \
+    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
+    pip install nvidia-nccl-cu12==2.22.3
+
+ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
+
+# Deps before the binaries
+# The binaries change on every build given we burn the SHA into them
+# The deps change less often.
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    libgssapi-krb5-2 \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# AWS Sagemaker compatible image
+FROM base as sagemaker
+
+COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
+
+RUN HOME_DIR=/root && \
+    pip install requests && \
+    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
+    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
+    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
+    chmod +x /usr/local/bin/testOSSCompliance && \
+    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
+    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
+    rm -rf ${HOME_DIR}/oss_compliance*
+COPY /huggingface/pytorch/tgi/docker/2.2.0/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES
+
+RUN /opt/conda/bin/conda clean -py
+
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["--json-output"]
+
+LABEL dlc_major_version="2"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
\ No newline at end of file

From c5435fdd535cdd232deae07a448e3605bd1ba60d Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 7 Oct 2024 12:54:26 +0200
Subject: [PATCH 3/4] add image

---
 .../pytorch/tgi/docker/2.3.0/Dockerfile       | 100 +++++++++---------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile b/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile
index e65c6f4..4a538ba 100644
--- a/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile
+++ b/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile
@@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -11,11 +11,15 @@ COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY benchmark benchmark
 COPY router router
+COPY backends backends
 COPY launcher launcher
+
 RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
 
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -30,18 +34,20 @@ COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY benchmark benchmark
 COPY router router
+COPY backends backends
 COPY launcher launcher
 RUN cargo build --profile release-opt
 
 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
 
 # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
-ARG PYTORCH_VERSION=2.3.0
-ARG PYTHON_VERSION=3.10
+ARG PYTORCH_VERSION=2.4.0
+
+ARG PYTHON_VERSION=3.11
 # Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=12.1
+ARG CUDA_VERSION=12.4
 ARG MAMBA_VERSION=24.3.0-0
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
@@ -50,13 +56,11 @@ ARG TARGETPLATFORM
 
 ENV PATH /opt/conda/bin:$PATH
 
-RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     build-essential \
     ca-certificates \
     ccache \
     curl \
-    libexpat1 \
-    libgssapi-krb5-2 \
     git && \
     rm -rf /var/lib/apt/lists/*
 
@@ -84,6 +88,7 @@ RUN case ${TARGETPLATFORM} in \
 FROM pytorch-install AS kernel-builder
 
 ARG MAX_JOBS=8
+ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     ninja-build cmake \
@@ -114,36 +119,29 @@ FROM kernel-builder AS exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .
 
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN python setup.py build
 
 # Build Transformers exllama kernels
 FROM kernel-builder AS exllamav2-kernels-builder
 WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
+COPY server/Makefile-exllamav2/ Makefile
 
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN make build-exllamav2
 
 # Build Transformers awq kernels
 FROM kernel-builder AS awq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+RUN make build-awq
 
 # Build eetq kernels
 FROM kernel-builder AS eetq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-eetq Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
-
-# Build marlin kernels
-FROM kernel-builder AS marlin-kernels-builder
-WORKDIR /usr/src
-COPY server/marlin/ .
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN make build-eetq
 
 # Build Lorax Punica kernels
 FROM kernel-builder AS lorax-punica-builder
@@ -165,8 +163,6 @@ FROM kernel-builder AS fbgemm-builder
 WORKDIR /usr/src
 
 COPY server/Makefile-fbgemm Makefile
-COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
-COPY server/fix_torch90a.sh fix_torch90a.sh
 
 RUN make build-fbgemm
 
@@ -188,6 +184,12 @@ WORKDIR /usr/src
 COPY server/Makefile-selective-scan Makefile
 RUN make build-all
 
+# Build flashinfer
+FROM kernel-builder AS flashinfer-builder
+WORKDIR /usr/src
+COPY server/Makefile-flashinfer Makefile
+RUN make install-flashinfer
+
 # Text Generation Inference base image
 FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base
 
@@ -196,54 +198,52 @@ ENV PATH=/opt/conda/bin:$PATH \
     CONDA_PREFIX=/opt/conda
 
 # Text Generation Inference base env
-ENV HUGGINGFACE_HUB_CACHE=/tmp \
+ENV HF_HOME=/tmp \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
     PORT=80
 
 WORKDIR /usr/src
 
-RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     libssl-dev \
     ca-certificates \
     make \
     unzip \
     curl \
     git \
-    libexpat1 \
-    libgssapi-krb5-2 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy conda with PyTorch installed
 COPY --from=pytorch-install /opt/conda /opt/conda
 
 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from marlin kernels builder
-COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
+# Copy build artifacts from lorax punica kernels builder
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from fbgemm builder
-COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
+COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
 
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
@@ -255,17 +255,21 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
+    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
     pip install nvidia-nccl-cu12==2.22.3
 
-ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
+ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
+# Required to find libpython within the rust binaries
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
+# This is needed because exl2 tries to load flash-attn
+# And fails with our builds.
+ENV EXLLAMA_NO_FLASH_ATTN=1
 
 # Deps before the binaries
 # The binaries change on every build given we burn the SHA into them
 # The deps change less often.
-RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     build-essential \
-    libgssapi-krb5-2 \
     g++ \
     && rm -rf /var/lib/apt/lists/*
 
@@ -278,7 +282,7 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/lo
 
 
 # AWS Sagemaker compatible image
-FROM base as sagemaker
+FROM base AS sagemaker
 
 COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
 
@@ -291,7 +295,7 @@ RUN HOME_DIR=/root && \
     chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
     ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
     rm -rf ${HOME_DIR}/oss_compliance*
-COPY /huggingface/pytorch/tgi/docker/2.2.0/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES
+COPY /huggingface/pytorch/tgi/docker/2.3.1/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES
 
 RUN /opt/conda/bin/conda clean -py
 

From 849a007e8d749cb5ad4eb3a4121d21e5e1b447f9 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 7 Oct 2024 15:58:03 +0200
Subject: [PATCH 4/4] fix

---
 huggingface/pytorch/tgi/docker/{2.3.0 => 2.3.1}/Dockerfile | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename huggingface/pytorch/tgi/docker/{2.3.0 => 2.3.1}/Dockerfile (100%)

diff --git a/huggingface/pytorch/tgi/docker/2.3.0/Dockerfile b/huggingface/pytorch/tgi/docker/2.3.1/Dockerfile
similarity index 100%
rename from huggingface/pytorch/tgi/docker/2.3.0/Dockerfile
rename to huggingface/pytorch/tgi/docker/2.3.1/Dockerfile