awslabs · fgbelidji · Apr 15, 2025 · Apr 16, 2025 · Apr 16, 2025 · Apr 16, 2025
@@ -19,7 +19,7 @@
 FRAMEWORK_DEVICE_DICT: Dict[str, List[str]] = {
     "TGI": ["GPU", "INF2"],
     "TEI": ["GPU", "CPU"],
-    "TGILLAMACPP": ["CPU"],
+    "TGILLAMACPP": ["GPU", "CPU"],
 }
 Framework = enum.Enum("Framework", ["TGI", "OPTIMUM", "TEI", "TGILLAMACPP"])
 Device = enum.Enum("Device", ["GPU", "INF2", "CPU"])

@@ -0,0 +1,125 @@
+# SageMaker image for the TGI llama.cpp router, CPU variant (GGML_CUDA defaults to OFF).
+# Stages: deps (toolchain and llama.cpp), planner (cargo-chef recipe), builder (router), sagemaker (runtime).
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps
+
+# Build-time knobs forwarded to the llama.cpp CMake configure step below.
+ARG llamacpp_version=b4827
+ARG llamacpp_cuda=OFF
+# NOTE(review): GGML_NATIVE=ON presumably tunes the build for the build host's CPU; confirm the
+# resulting binaries run on every instance type this image is deployed to.
+ARG llamacpp_native=ON
+ARG llamacpp_cpu_arm_arch=native
+ARG cuda_arch=75-real;80-real;86-real;89-real;90-real
+
+WORKDIR /opt/src
+
+ENV DEBIAN_FRONTEND=noninteractive
+# apt-get (not apt) is the script-stable CLI; the package lists are dropped once installed.
+RUN apt-get update && apt-get upgrade -y && apt-get install -y \
+    clang \
+    cmake \
+    curl \
+    git \
+    python3-dev \
+    unzip \
+    libssl-dev \
+    pkg-config \
+    tar \
+ && rm -rf /var/lib/apt/lists/*
+
+# Fetch the pinned llama.cpp release, then build it and install the libraries under /usr.
+ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
+RUN mkdir -p llama.cpp \
+ && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+ && cd llama.cpp \
+ && cmake -B build \
+    -DCMAKE_INSTALL_PREFIX=/usr \
+    -DCMAKE_INSTALL_LIBDIR=/usr/lib \
+    -DCMAKE_C_COMPILER=clang \
+    -DCMAKE_CXX_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
+    -DGGML_CUDA=${llamacpp_cuda} \
+    -DGGML_NATIVE=${llamacpp_native} \
+    -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
+    -DLLAMA_BUILD_COMMON=OFF \
+    -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
+ && cmake --build build --parallel --config Release \
+ && cmake --install build
+
+# Rust toolchain plus cargo-chef, which the planner and builder stages invoke.
+WORKDIR /app
+COPY rust-toolchain.toml rust-toolchain.toml
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN cargo install cargo-chef --locked
+
+# planner: derive the cargo-chef recipe from the full source tree.
+FROM deps AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+# builder: cook the dependencies from the recipe before copying the sources and building the router.
+FROM deps AS builder
+COPY --from=planner /app/recipe.json recipe.json
+RUN cargo chef cook \
+    --recipe-path recipe.json \
+    --profile release \
+    --package text-generation-router-llamacpp
+COPY . .
+RUN cargo build \
+    --profile release \
+    --package text-generation-router-llamacpp --frozen
+
+# sagemaker: runtime image holding the router binary, the llama.cpp libraries and the GGUF Python tooling.
+FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get upgrade -y && apt-get install -y \
+    python3-venv \
+    unzip \
+    curl \
+    python3-pip \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
+
+COPY backends/llamacpp/requirements.txt requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+
+RUN pip3 install --no-cache-dir \
+    -r requirements.txt \
+    -e gguf-py
+
+COPY --from=builder /usr/lib/libllama.so /usr/lib/
+COPY --from=builder /usr/lib/libggml*.so /usr/lib/
+COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
+
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:cpu:inference:tgi-llamacpp
+
+# Download the AWS OSS compliance helper, run it for the Python packages, then remove it.
+RUN HOME_DIR=/root && \
+    pip3 install --no-cache-dir requests PTable setuptools && \
+    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
+    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
+    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
+    chmod +x /usr/local/bin/testOSSCompliance && \
+    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
+    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
+    rm -rf ${HOME_DIR}/oss_compliance*
+
+COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES
+COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["--json-output"]
+
+LABEL dlc_major_version="1"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
@@ -0,0 +1,27 @@
+#!/bin/bash
+if [[ -z "${HF_MODEL_ID}" ]]; then
+  echo "HF_MODEL_ID must be set"
+  exit 1
+fi
+export MODEL_ID="${HF_MODEL_ID}"
+
+mkdir -p models
+
+if [[ -n "${HF_MODEL_GGUF}" ]]; then
+  if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then
+    huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}"
+    echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}"
+    export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)"
+  else
+    huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}"
+    echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}"
+    export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)"
+  fi
+
+  if [[ -z "${MODEL_GGUF}" ]]; then
+    echo "No gguf files found in ./models/${HF_MODEL_GGUF}"
+    exit 1
+  fi
+fi
+
+text-generation-router-llamacpp --port 8080
@@ -0,0 +1,128 @@
+# SageMaker image for the TGI llama.cpp router, GPU variant (GGML_CUDA defaults to ON).
+# Stages: deps (toolchain and llama.cpp), planner (cargo-chef recipe), builder (router), sagemaker (runtime).
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps
+
+# Build-time knobs forwarded to the llama.cpp CMake configure step below.
+ARG llamacpp_version=b4827
+ARG llamacpp_cuda=ON
+# NOTE(review): GGML_NATIVE=ON presumably tunes the build for the build host's CPU; confirm the
+# resulting binaries run on every instance type this image is deployed to.
+ARG llamacpp_native=ON
+ARG llamacpp_cpu_arm_arch=native
+ARG cuda_arch=75-real;80-real;86-real;89-real;90-real
+
+WORKDIR /opt/src
+
+ENV DEBIAN_FRONTEND=noninteractive
+# apt-get (not apt) is the script-stable CLI; the package lists are dropped once installed.
+RUN apt-get update && apt-get upgrade -y && apt-get install -y \
+    clang \
+    cmake \
+    curl \
+    git \
+    python3-dev \
+    unzip \
+    libssl-dev \
+    pkg-config \
+    tar \
+ && rm -rf /var/lib/apt/lists/*
+
+# Fetch the pinned llama.cpp release, then build it and install the libraries under /usr.
+ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
+RUN mkdir -p llama.cpp \
+ && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+ && cd llama.cpp \
+ && cmake -B build \
+    -DCMAKE_INSTALL_PREFIX=/usr \
+    -DCMAKE_INSTALL_LIBDIR=/usr/lib \
+    -DCMAKE_C_COMPILER=clang \
+    -DCMAKE_CXX_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
+    -DGGML_CUDA=${llamacpp_cuda} \
+    -DGGML_NATIVE=${llamacpp_native} \
+    -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
+    -DLLAMA_BUILD_COMMON=OFF \
+    -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
+ && cmake --build build --parallel --config Release \
+ && cmake --install build
+
+# Rust toolchain plus cargo-chef, which the planner and builder stages invoke.
+WORKDIR /app
+COPY rust-toolchain.toml rust-toolchain.toml
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN cargo install cargo-chef --locked
+
+# planner: derive the cargo-chef recipe from the full source tree.
+FROM deps AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+# builder: cook the dependencies from the recipe before copying the sources and building the router.
+FROM deps AS builder
+COPY --from=planner /app/recipe.json recipe.json
+RUN cargo chef cook \
+    --recipe-path recipe.json \
+    --profile release \
+    --package text-generation-router-llamacpp
+COPY . .
+RUN cargo build \
+    --profile release \
+    --package text-generation-router-llamacpp --frozen
+
+# sagemaker: runtime image holding the router binary, the llama.cpp libraries and the GGUF Python tooling.
+FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && apt-get upgrade -y && apt-get install -y \
+    python3-venv \
+    unzip \
+    curl \
+    python3-pip \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
+
+COPY backends/llamacpp/requirements.txt requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+
+RUN pip3 install --no-cache-dir \
+    -r requirements.txt \
+    -e gguf-py
+
+COPY --from=builder /usr/lib/libllama.so /usr/lib/
+COPY --from=builder /usr/lib/libggml*.so /usr/lib/
+COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
+
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp
+
+# Download the AWS OSS compliance helper, run it for the Python packages, then remove it.
+RUN HOME_DIR=/root && \
+    pip3 install --no-cache-dir requests PTable setuptools && \
+    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
+    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
+    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
+    chmod +x /usr/local/bin/testOSSCompliance && \
+    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
+    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
+    rm -rf ${HOME_DIR}/oss_compliance*
+
+COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES
+COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh start-cuda-compat.sh
+RUN chmod +x start-cuda-compat.sh
+
+COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["--json-output"]
+
+LABEL dlc_major_version="1"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
@@ -0,0 +1,27 @@
+#!/bin/bash
+if [[ -z "${HF_MODEL_ID}" ]]; then
+  echo "HF_MODEL_ID must be set"
+  exit 1
+fi
+export MODEL_ID="${HF_MODEL_ID}"
+
+mkdir -p models
+
+if [[ -n "${HF_MODEL_GGUF}" ]]; then
+  if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then
+    huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}"
+    echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}"
+    export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)"
+  else
+    huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}"
+    echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}"
+    export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)"
+  fi
+
+  if [[ -z "${MODEL_GGUF}" ]]; then
+    echo "No gguf files found in ./models/${HF_MODEL_GGUF}"
+    exit 1
+  fi
+fi
+
+text-generation-router-llamacpp --port 8080
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Prepend the CUDA compat libraries to LD_LIBRARY_PATH when the host NVIDIA
+# driver version sorts before the version carried by the compat libcuda.
+# Source this file; when it is executed instead, the export does not reach the caller.
+
+# verlt A B: succeeds when A sorts strictly before B in `sort -V` version order.
+verlt() {
+    [ "$1" = "$2" ] && return 1
+    [ "$1" = "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]
+}
+
+if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
+    # Keeps the dot-separated fields from the third onwards of the symlink target
+    # (assumes the target is named libcuda.so.<version> - TODO confirm).
+    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
+    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
+    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
+    if [ -z "${NVIDIA_DRIVER_VERSION}" ]; then
+        # No readable driver version: leave the library path untouched and say why.
+        echo "Skipping CUDA compat setup as NVIDIA driver version could not be determined"
+    elif verlt "${NVIDIA_DRIVER_VERSION}" "${CUDA_COMPAT_MAX_DRIVER_VERSION}"; then
+        echo "Adding CUDA compat to LD_LIBRARY_PATH"
+        # Only append the old value when it is non-empty, so no trailing ':' is produced.
+        export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        echo "${LD_LIBRARY_PATH}"
+    else
+        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
+    fi
+else
+    echo "Skipping CUDA compat setup as package not found"
+fi
@@ -100,37 +100,43 @@
                 "python_version": "py310",
                 "pytorch_version": "2.0.1"
             }
-        ]
-    },
-    "ignore_vulnerabilities": [
-        "CVE-2024-42154 - linux",
-        "CVE-2025-32434 - torch"
-    ],
-    "releases": [
+        ],
+
+    "TGILLAMACPP": [
         {
-            "framework": "TEI",
             "device": "gpu",
-            "version": "1.7.0",
+            "min_version": "3.2.3",
+            "max_version": "3.2.3",
             "os_version": "ubuntu22.04",
-            "python_version": "py310",
-            "pytorch_version": "2.0.1",
-            "cuda_version": "cu122"
+            "cuda_version": "cu128",
+            "python_version": "py311",
+            "pytorch_version": "2.6.0"
         },
         {
-            "framework": "TEI",
             "device": "cpu",
-            "version": "1.7.0",
+            "min_version": "3.2.3",
+            "max_version": "3.2.3",
             "os_version": "ubuntu22.04",
-            "python_version": "py310",
-            "pytorch_version": "2.0.1"
-        },
+            "cuda_version": "cu128",
+            "python_version": "py311",
+            "pytorch_version": "2.6.0"
+        }
+    ]
+},
+
+    "ignore_vulnerabilities": [
+        "CVE-2024-42154 - linux",
+        "CVE-2025-32434 - torch"
+    ],
+    "releases": [
         {
-            "framework": "TGI",
-            "device": "inf2",
-            "version": "0.0.28",
+            "framework": "TGILLAMACPP",
+            "device": "gpu",
+            "version": "3.2.3",
+            "cuda_version": "cu128",
             "os_version": "ubuntu22.04",
-            "python_version": "py310",
-            "pytorch_version": "2.1.2"
+            "python_version": "py311",
+            "pytorch_version": "2.6.0"
         }
     ]
 }