-
Notifications
You must be signed in to change notification settings - Fork 51
Tgi llamacpp #147
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Tgi llamacpp #147
Changes from all commits
3bb20ab
a18a1a0
dded0ef
f04339a
5e8b18f
42d19fb
238021f
411a8a7
4dd4d55
d4d7990
c7e9246
a82bbb4
718cb3f
11b09a7
beec1b0
2929320
c5561d1
afe8d84
94d5108
7a8a453
13040f2
505c499
6e86ba6
434854a
fbd94e9
b95b21c
109550a
95b191f
6996a75
70eaac5
6bae751
371ca77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps | ||
|
|
||
| ARG llamacpp_version=b4827 | ||
| ARG llamacpp_cuda=OFF | ||
| ARG llamacpp_native=ON | ||
| ARG llamacpp_cpu_arm_arch=native | ||
| ARG cuda_arch=75-real;80-real;86-real;89-real;90-real | ||
|
|
||
| WORKDIR /opt/src | ||
|
|
||
| ENV DEBIAN_FRONTEND=noninteractive | ||
| RUN apt update && apt upgrade -y && apt install -y \ | ||
| clang \ | ||
| cmake \ | ||
| curl \ | ||
| git \ | ||
| python3-dev \ | ||
| unzip \ | ||
| libssl-dev \ | ||
| pkg-config \ | ||
| tar | ||
|
|
||
| ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ | ||
| RUN mkdir -p llama.cpp \ | ||
| && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ | ||
| && cd llama.cpp \ | ||
| && cmake -B build \ | ||
| -DCMAKE_INSTALL_PREFIX=/usr \ | ||
| -DCMAKE_INSTALL_LIBDIR=/usr/lib \ | ||
| -DCMAKE_C_COMPILER=clang \ | ||
| -DCMAKE_CXX_COMPILER=clang++ \ | ||
| -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ | ||
| -DGGML_CUDA=${llamacpp_cuda} \ | ||
| -DGGML_NATIVE=${llamacpp_native} \ | ||
| -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ | ||
| -DLLAMA_BUILD_COMMON=OFF \ | ||
| -DLLAMA_BUILD_TESTS=OFF \ | ||
| -DLLAMA_BUILD_EXAMPLES=OFF \ | ||
| -DLLAMA_BUILD_SERVER=OFF \ | ||
| && cmake --build build --parallel --config Release \ | ||
| && cmake --install build | ||
|
|
||
| WORKDIR /app | ||
| COPY rust-toolchain.toml rust-toolchain.toml | ||
| RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y | ||
| ENV PATH="/root/.cargo/bin:$PATH" | ||
| RUN cargo install cargo-chef --locked | ||
|
|
||
| FROM deps AS planner | ||
| COPY . . | ||
| RUN cargo chef prepare --recipe-path recipe.json | ||
|
|
||
| FROM deps AS builder | ||
| COPY --from=planner /app/recipe.json recipe.json | ||
| RUN cargo chef cook \ | ||
| --recipe-path recipe.json \ | ||
| --profile release \ | ||
| --package text-generation-router-llamacpp | ||
| COPY . . | ||
| RUN cargo build \ | ||
| --profile release \ | ||
| --package text-generation-router-llamacpp --frozen | ||
|
|
||
| FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker | ||
| WORKDIR /app | ||
|
|
||
| ENV DEBIAN_FRONTEND=noninteractive | ||
| RUN apt update && apt upgrade -y && apt install -y \ | ||
| python3-venv \ | ||
| unzip \ | ||
| curl \ | ||
| python3-pip | ||
|
|
||
| RUN python3 -m venv /venv | ||
| ENV PATH="/venv/bin:$PATH" | ||
|
|
||
| COPY backends/llamacpp/requirements.txt requirements.txt | ||
| COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py | ||
| COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ | ||
|
|
||
| RUN pip3 install --no-cache-dir \ | ||
| -r requirements.txt \ | ||
| -e gguf-py | ||
|
|
||
| COPY --from=builder /usr/lib/libllama.so /usr/lib/ | ||
| COPY --from=builder /usr/lib/libggml*.so /usr/lib/ | ||
| COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ | ||
|
|
||
| ENV HF_HUB_ENABLE_HF_TRANSFER=1 | ||
|
|
||
|
|
||
| RUN HOME_DIR=/root && \ | ||
| pip3 install requests PTable setuptools && \ | ||
| curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ | ||
| unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ | ||
| cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ | ||
| chmod +x /usr/local/bin/testOSSCompliance && \ | ||
| chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ | ||
| ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ | ||
| rm -rf ${HOME_DIR}/oss_compliance* | ||
|
|
||
| ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" | ||
| ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:cpu:inference:tgi-llamacpp | ||
|
|
||
| COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES | ||
| COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh entrypoint.sh | ||
| RUN chmod +x entrypoint.sh | ||
|
|
||
| ENTRYPOINT ["./entrypoint.sh"] | ||
| CMD ["--json-output"] | ||
|
|
||
| LABEL dlc_major_version="1" | ||
| LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" | ||
| LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| #!/bin/bash | ||
| if [[ -z "${HF_MODEL_ID}" ]]; then | ||
| echo "HF_MODEL_ID must be set" | ||
| exit 1 | ||
| fi | ||
| export MODEL_ID="${HF_MODEL_ID}" | ||
|
|
||
| mkdir -p models | ||
|
|
||
| if [[ -n "${HF_MODEL_GGUF}" ]]; then | ||
| if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then | ||
| huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" | ||
| echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" | ||
| export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" | ||
| else | ||
| huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" | ||
| echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" | ||
| export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" | ||
| fi | ||
|
|
||
| if [[ -z "${MODEL_GGUF}" ]]; then | ||
| echo "No gguf files found in ./models/${HF_MODEL_GGUF}" | ||
| exit 1 | ||
| fi | ||
| fi | ||
|
|
||
| text-generation-router-llamacpp --port 8080 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps | ||
|
|
||
| ARG llamacpp_version=b4827 | ||
| ARG llamacpp_cuda=ON | ||
| ARG llamacpp_native=ON | ||
| ARG llamacpp_cpu_arm_arch=native | ||
| ARG cuda_arch=75-real;80-real;86-real;89-real;90-real | ||
|
|
||
| WORKDIR /opt/src | ||
|
|
||
| ENV DEBIAN_FRONTEND=noninteractive | ||
| RUN apt update && apt upgrade -y && apt install -y \ | ||
| clang \ | ||
| cmake \ | ||
| curl \ | ||
| git \ | ||
| python3-dev \ | ||
| unzip \ | ||
| libssl-dev \ | ||
| pkg-config \ | ||
| tar | ||
|
|
||
| ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ | ||
| RUN mkdir -p llama.cpp \ | ||
| && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ | ||
| && cd llama.cpp \ | ||
| && cmake -B build \ | ||
| -DCMAKE_INSTALL_PREFIX=/usr \ | ||
| -DCMAKE_INSTALL_LIBDIR=/usr/lib \ | ||
| -DCMAKE_C_COMPILER=clang \ | ||
| -DCMAKE_CXX_COMPILER=clang++ \ | ||
| -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ | ||
| -DGGML_CUDA=${llamacpp_cuda} \ | ||
| -DGGML_NATIVE=${llamacpp_native} \ | ||
| -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ | ||
| -DLLAMA_BUILD_COMMON=OFF \ | ||
| -DLLAMA_BUILD_TESTS=OFF \ | ||
| -DLLAMA_BUILD_EXAMPLES=OFF \ | ||
| -DLLAMA_BUILD_SERVER=OFF \ | ||
| && cmake --build build --parallel --config Release \ | ||
| && cmake --install build | ||
|
|
||
| WORKDIR /app | ||
| COPY rust-toolchain.toml rust-toolchain.toml | ||
| RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y | ||
| ENV PATH="/root/.cargo/bin:$PATH" | ||
| RUN cargo install cargo-chef --locked | ||
|
|
||
| FROM deps AS planner | ||
| COPY . . | ||
| RUN cargo chef prepare --recipe-path recipe.json | ||
|
|
||
| FROM deps AS builder | ||
| COPY --from=planner /app/recipe.json recipe.json | ||
| RUN cargo chef cook \ | ||
| --recipe-path recipe.json \ | ||
| --profile release \ | ||
| --package text-generation-router-llamacpp | ||
| COPY . . | ||
| RUN cargo build \ | ||
| --profile release \ | ||
| --package text-generation-router-llamacpp --frozen | ||
|
|
||
| FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker | ||
| WORKDIR /app | ||
|
|
||
| ENV DEBIAN_FRONTEND=noninteractive | ||
| RUN apt update && apt upgrade -y && apt install -y \ | ||
| python3-venv \ | ||
| unzip \ | ||
| curl \ | ||
| python3-pip | ||
|
|
||
| RUN python3 -m venv /venv | ||
| ENV PATH="/venv/bin:$PATH" | ||
|
|
||
| COPY backends/llamacpp/requirements.txt requirements.txt | ||
| COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py | ||
| COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ | ||
|
|
||
| RUN pip3 install --no-cache-dir \ | ||
| -r requirements.txt \ | ||
| -e gguf-py | ||
|
|
||
| COPY --from=builder /usr/lib/libllama.so /usr/lib/ | ||
| COPY --from=builder /usr/lib/libggml*.so /usr/lib/ | ||
| COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ | ||
|
|
||
| ENV HF_HUB_ENABLE_HF_TRANSFER=1 | ||
| ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp | ||
|
|
||
| RUN HOME_DIR=/root && \ | ||
| pip3 install requests PTable setuptools && \ | ||
| curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ | ||
| unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ | ||
| cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ | ||
| chmod +x /usr/local/bin/testOSSCompliance && \ | ||
| chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ | ||
| ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ | ||
| rm -rf ${HOME_DIR}/oss_compliance* | ||
|
|
||
|
|
||
| COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES | ||
| COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh start-cuda-compat.sh | ||
| RUN chmod +x start-cuda-compat.sh | ||
|
|
||
| COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh | ||
| RUN chmod +x entrypoint.sh | ||
|
|
||
| ENTRYPOINT ["./entrypoint.sh"] | ||
| CMD ["--json-output"] | ||
|
|
||
| LABEL dlc_major_version="1" | ||
| LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" | ||
| LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| #!/bin/bash | ||
| if [[ -z "${HF_MODEL_ID}" ]]; then | ||
| echo "HF_MODEL_ID must be set" | ||
| exit 1 | ||
| fi | ||
| export MODEL_ID="${HF_MODEL_ID}" | ||
|
|
||
| mkdir -p models | ||
|
|
||
| if [[ -n "${HF_MODEL_GGUF}" ]]; then | ||
| if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then | ||
| huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" | ||
| echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" | ||
| export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" | ||
| else | ||
| huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" | ||
| echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" | ||
| export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" | ||
| fi | ||
|
|
||
| if [[ -z "${MODEL_GGUF}" ]]; then | ||
| echo "No gguf files found in ./models/${HF_MODEL_GGUF}" | ||
| exit 1 | ||
| fi | ||
| fi | ||
|
|
||
| text-generation-router-llamacpp --port 8080 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| #!/bin/bash | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Correct me if I am wrong, but this is because of the vulnerability, right? I'll take an action item to check with the hosting platform whether they have any long-term remediation for this vulnerability.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My understanding is that this is an overhead every time we add a new image, right?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Indeed, that's for the vulnerability, and yes, this is an overhead for all the new CUDA-based images |
||
|
|
||
#######################################
# Strict "version less-than" test using GNU `sort -V` ordering.
# Arguments: $1 - left-hand version string
#            $2 - right-hand version string
# Returns:   0 when $1 sorts strictly before $2, 1 otherwise
#            (equal versions are NOT considered less-than).
#######################################
verlt() {
  local lhs=$1 rhs=$2 lowest
  # Identical strings can never be strictly smaller.
  if [ "$lhs" = "$rhs" ]; then
    return 1
  fi
  # printf avoids echo's option/backslash interpretation of the arguments.
  lowest=$(printf '%s\n%s\n' "$lhs" "$rhs" | sort -V | head -n1)
  # $1 is smaller exactly when it is the first line after a version sort.
  [ "$lhs" = "$lowest" ]
}
|
|
||
| if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then | ||
| CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-) | ||
| echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}" | ||
| NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) | ||
| echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}" | ||
| if verlt $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then | ||
| echo "Adding CUDA compat to LD_LIBRARY_PATH" | ||
| export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH | ||
| echo $LD_LIBRARY_PATH | ||
| else | ||
| echo "Skipping CUDA compat setup as newer NVIDIA driver is installed" | ||
| fi | ||
| else | ||
| echo "Skipping CUDA compat setup as package not found" | ||
| fi | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -100,37 +100,43 @@ | |
| "python_version": "py310", | ||
| "pytorch_version": "2.0.1" | ||
| } | ||
| ] | ||
| }, | ||
| "ignore_vulnerabilities": [ | ||
| "CVE-2024-42154 - linux", | ||
| "CVE-2025-32434 - torch" | ||
| ], | ||
| "releases": [ | ||
| ], | ||
|
|
||
| "TGILLAMACPP": [ | ||
| { | ||
| "framework": "TEI", | ||
| "device": "gpu", | ||
| "version": "1.7.0", | ||
| "min_version": "3.2.3", | ||
| "max_version": "3.2.3", | ||
| "os_version": "ubuntu22.04", | ||
| "python_version": "py310", | ||
| "pytorch_version": "2.0.1", | ||
| "cuda_version": "cu122" | ||
| "cuda_version": "cu128", | ||
| "python_version": "py311", | ||
| "pytorch_version": "2.6.0" | ||
| }, | ||
| { | ||
| "framework": "TEI", | ||
| "device": "cpu", | ||
| "version": "1.7.0", | ||
| "min_version": "3.2.3", | ||
| "max_version": "3.2.3", | ||
| "os_version": "ubuntu22.04", | ||
| "python_version": "py310", | ||
| "pytorch_version": "2.0.1" | ||
| }, | ||
| "cuda_version": "cu128", | ||
| "python_version": "py311", | ||
| "pytorch_version": "2.6.0" | ||
| } | ||
| ] | ||
| }, | ||
|
|
||
| "ignore_vulnerabilities": [ | ||
| "CVE-2024-42154 - linux", | ||
| "CVE-2025-32434 - torch" | ||
| ], | ||
| "releases": [ | ||
| { | ||
| "framework": "TGI", | ||
| "device": "inf2", | ||
| "version": "0.0.28", | ||
| "framework": "TGILLAMACPP", | ||
| "device": "gpu", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't you want to add both CPU and GPU images?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, tested both independently locally, but will add that to build the two |
||
| "version": "3.2.3", | ||
| "cuda_version": "cu128", | ||
| "os_version": "ubuntu22.04", | ||
| "python_version": "py310", | ||
| "pytorch_version": "2.1.2" | ||
| "python_version": "py311", | ||
| "pytorch_version": "2.6.0" | ||
| } | ||
| ] | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
interesting, I thought it's just a CPU image.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah and ideally for Graviton instances there should be a different container to maximize the performance, see https://huggingface.co/docs/text-generation-inference/backends/llamacpp#build-docker-image