From 3bb20abd6f7042406232dfaf534085a5d6fa716b Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Tue, 15 Apr 2025 17:22:57 +0200 Subject: [PATCH 01/26] Added TGI llamacpp ressources --- .../docker/3.2.3/THIRD-PARTY-LICENSES | 583 ++++++++++++++++++ .../tgillamacpp/docker/3.2.3/gpu/Dockerfile | 114 ++++ .../docker/3.2.3/gpu/entrypoint.sh | 30 + .../docker/3.2.3/gpu/start-cuda-compat.sh | 21 + 4 files changed, 748 insertions(+) create mode 100644 huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES create mode 100644 huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile create mode 100644 huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh create mode 100644 huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES b/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES new file mode 100644 index 0000000..4f3f5dc --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES @@ -0,0 +1,583 @@ +** text-generation-inference; version 3.2.3 -- https://github.com/huggingface/text-generation-inference + +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2022 Hugging Face + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------ + +** transformers; version 4.49.0 -- https://github.com/huggingface/transformers + +Copyright 2018- The Hugging Face team. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +* For transformers see also this required NOTICE: + Copyright 2018- The Hugging Face team. All rights reserved. + + https://github.com/huggingface/transformers/blob/main/LICENSE + +------ + +** Flash Attention; version 2.3.3 -- https://github.com/Dao-AILab/flash-attention/tree/v2.3.2 +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +BSD 3-Clause License + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------ + +** PyTorch; version 2.6.0 -- https://github.com/pytorch/pytorch +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. 
+ +All contributions by Arm: +Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +------ + +** miniforge; version 23.3.1-1 -- https://github.com/conda-forge/miniforge/tree/23.3.1-1 +Copyright (c) 2019-2022, conda-forge +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Miniforge installer code uses BSD-3-Clause license as stated below. + +Binary packages that come with it have their own licensing terms +and by installing miniforge you agree to the licensing terms of individual +packages as well. They include different OSI-approved licenses including +the GNU General Public License and can be found in pkgs//info/licenses +folders. + +Miniforge installer comes with a boostrapping executable that is used +when installing miniforge and is deleted after miniforge is installed. +The bootstrapping executable uses micromamba, cli11, cpp-filesystem, +curl, c-ares, krb5, libarchive, libev, lz4, nghttp2, openssl, libsolv, +nlohmann-json, reproc and zstd which are licensed under BSD-3-Clause, +MIT and OpenSSL licenses. Licenses and copyright notices of these +projects can be found at the following URL. +https://github.com/conda-forge/micromamba-feedstock/tree/master/recipe. 
diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile new file mode 100644 index 0000000..3edcf45 --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -0,0 +1,119 @@ +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps + +ARG llamacpp_version=b4827 +ARG llamacpp_cuda=ON +ARG llamacpp_native=ON +ARG llamacpp_cpu_arm_arch=native +ARG cuda_arch=75-real;80-real;86-real;89-real;90-real + +WORKDIR /opt/src + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ + clang \ + cmake \ + curl \ + git \ + python3-dev \ + unzip \ + libssl-dev \ + pkg-config \ + tar + +ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN mkdir -p llama.cpp \ + && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ + && cd llama.cpp \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ + -DGGML_CUDA=${llamacpp_cuda} \ + -DGGML_NATIVE=${llamacpp_native} \ + -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + && cmake --build build --parallel --config Release \ + && cmake --install build + +WORKDIR /app +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked + +FROM deps AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json + +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ + --profile release \ + --package text-generation-router-llamacpp +COPY . . 
+RUN cargo build \ + --profile release \ + --package text-generation-router-llamacpp --frozen + +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 +WORKDIR /app + +ENV DEBIAN_FRONTEND=noninteractive +# curl and unzip are needed by the OSS compliance step further down. +RUN apt update && apt upgrade -y && apt install -y \ + python3-venv \ + python3-pip \ + curl \ + unzip + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +COPY backends/llamacpp/requirements.txt requirements.txt +COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py +COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ + +RUN pip3 install --no-cache-dir \ + -r requirements.txt \ + -e gguf-py + +COPY --from=builder /usr/lib/libllama.so /usr/lib/ +COPY --from=builder /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# Download the AWS OSS compliance tooling, run it, then remove it from the image. +RUN HOME_DIR=/root && \ + pip3 install requests PTable && \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ + chmod +x /usr/local/bin/testOSSCompliance && \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ + ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ + rm -rf ${HOME_DIR}/oss_compliance* + +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES +# start-cuda-compat.sh is created under gpu/ by this patch, next to this Dockerfile. 
+COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh /root/start-cuda-compat.sh +RUN chmod +x /root/start-cuda-compat.sh + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" +ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp + +COPY sagemaker-entrypoint.sh entrypoint.sh +RUN chmod +x entrypoint.sh + +ENTRYPOINT ["./entrypoint.sh"] +CMD ["--json-output"] + +LABEL dlc_major_version="2" +LABEL 
com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh new file mode 100644 index 0000000..cc9fd9b --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh @@ -0,0 +1,32 @@ +#!/bin/bash +if [[ -z "${HF_MODEL_ID}" ]]; then + echo "HF_MODEL_ID must be set" + exit 1 +fi +export MODEL_ID="${HF_MODEL_ID}" + +if [[ -z "${HF_MODEL_GGUF}" ]]; then + echo "HF_MODEL_GGUF must be set" + exit 1 +fi + +# -p: do not fail when the directory already exists. +mkdir -p models + +if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then + # Fetch only the .gguf files under the requested sub-directory of the repo. + huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" +else + huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" +fi + +if [[ -z "${MODEL_GGUF}" ]]; then + echo "No gguf files found in ./models/${HF_MODEL_GGUF}" + exit 1 +fi + +text-generation-router-llamacpp --port 8080 \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh new file mode 100644 index 0000000..363cc14 --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh @@ -0,0 +1,21 @@ +#!/bin/bash + 
+verlt() { + [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] +} + +if [ -f 
/usr/local/cuda/compat/libcuda.so.1 ]; then + CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-) + echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}" + NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}" + if verlt $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then + echo "Adding CUDA compat to LD_LIBRARY_PATH" + export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH + echo $LD_LIBRARY_PATH + else + echo "Skipping CUDA compat setup as newer NVIDIA driver is installed" + fi +else + echo "Skipping CUDA compat setup as package not found" +fi \ No newline at end of file From a18a1a0a86fb72965efa8d6de55cd671b7b2eceb Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Wed, 16 Apr 2025 14:11:56 +0200 Subject: [PATCH 02/26] Added new tests for llamacpp backend --- tests/huggingface/sagemaker_dlc_test.py | 140 ++++++++++++++++++------ 1 file changed, 106 insertions(+), 34 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 78272e5..15f8f3b 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -30,76 +30,144 @@ def run_test(args): default_env["MAX_BATCH_SIZE"] = "1" default_env["MAX_INPUT_TOKENS"] = "2048" default_env["MAX_TOTAL_TOKENS"] = "4096" + elif args.image_type == "TGILLAMACPP": + llamacpp_env = default_env.copy() + llamacpp_env["HF_MODEL_GGUF"] = args.model_gguf else: default_env["SM_NUM_GPUS"] = "4" signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(int(args.timeout)) predictor = None + second_predictor = None + try: # Create Hugging Face Model Class endpoint_name = args.model_id.replace("/","-").replace(".", "-")[:40] endpoint_name = endpoint_name + "-" + 
time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) - model = HuggingFaceModel( - name=endpoint_name, - env=default_env, - role=args.role, - image_uri=args.image_uri - ) - deploy_parameters = { - "instance_type": args.instance_type, - "initial_instance_count": 1, - "endpoint_name": endpoint_name, - "container_startup_health_check_timeout": 1800, - } - if args.instance_type.startswith("ml.inf2"): - deploy_parameters["volume_size"] = 256 - predictor = model.deploy(**deploy_parameters) - - logging.info("Endpoint deployment complete.") - + data = { - "inputs": "What is Deep Learning?", - "parameters": {"max_new_tokens": 50, "top_k": 50, "top_p": 0.95, "do_sample": True}, - } - output = predictor.predict(data) - logging.info("Output: " + json.dumps(output)) + "inputs": "What is Deep Learning?", + "parameters": {"max_new_tokens": 50, "top_k": 50, "top_p": 0.95, "do_sample": True}, + } + + # For TGILLAMACPP, test both with and without the GGUF environment + if args.image_type == "TGILLAMACPP": + # First endpoint with default environment (without GGUF) + logging.info("Deploying first endpoint with default environment (without GGUF)") + model = HuggingFaceModel( + name=endpoint_name, + env=default_env, + role=args.role, + image_uri=args.image_uri + ) + deploy_parameters = { + "instance_type": args.instance_type, + "initial_instance_count": 1, + "endpoint_name": endpoint_name, + "container_startup_health_check_timeout": 1800, + } + if args.instance_type.startswith("ml.inf2"): + deploy_parameters["volume_size"] = 256 + + predictor = model.deploy(**deploy_parameters) + logging.info("First endpoint deployment complete.") + + output = predictor.predict(data) + logging.info("First endpoint output: " + json.dumps(output)) + + # Second endpoint with llamacpp environment (with GGUF) + logging.info("Deploying second endpoint with llamacpp environment (with GGUF)") + second_endpoint_name = endpoint_name + "-gguf" + second_model = HuggingFaceModel( + name=second_endpoint_name, + 
env=llamacpp_env, + role=args.role, + image_uri=args.image_uri + ) + + second_deploy_parameters = deploy_parameters.copy() + second_deploy_parameters["endpoint_name"] = second_endpoint_name + + second_predictor = second_model.deploy(**second_deploy_parameters) + logging.info("Second endpoint deployment complete.") + + second_output = second_predictor.predict(data) + logging.info("Second endpoint output: " + json.dumps(second_output)) + + else: + # For other image types, just deploy a single endpoint + env_to_use = default_env + model = HuggingFaceModel( + name=endpoint_name, + env=env_to_use, + role=args.role, + image_uri=args.image_uri + ) + deploy_parameters = { + "instance_type": args.instance_type, + "initial_instance_count": 1, + "endpoint_name": endpoint_name, + "container_startup_health_check_timeout": 1800, + } + if args.instance_type.startswith("ml.inf2"): + deploy_parameters["volume_size"] = 256 + + predictor = model.deploy(**deploy_parameters) + logging.info("Endpoint deployment complete.") + + output = predictor.predict(data) + logging.info("Output: " + json.dumps(output)) + # TODO: we need to clearly define the expected output format for each models. 
# assert "generated_text" in output[0] finally: if predictor: predictor.delete_model() predictor.delete_endpoint() + if second_predictor: + second_predictor.delete_model() + second_predictor.delete_endpoint() + signal.alarm(0) signal.alarm(0) def get_models_for_image(image_type, device_type): if image_type == "TGI": if device_type == "gpu": return [ - ("bigscience/bloom-560m", None, "ml.g5.12xlarge"), - ("EleutherAI/gpt-neox-20b", None, "ml.g5.12xlarge"), - ("google/flan-t5-xxl", None, "ml.g5.12xlarge"), + ("bigscience/bloom-560m", None, "ml.g5.12xlarge", None), + ("EleutherAI/gpt-neox-20b", None, "ml.g5.12xlarge", None), + ("google/flan-t5-xxl", None, "ml.g5.12xlarge", None), ] elif device_type == "inf2": - return [ ("princeton-nlp/Sheared-LLaMA-1.3B", None, "ml.inf2.xlarge") ] + return [ ("princeton-nlp/Sheared-LLaMA-1.3B", None, "ml.inf2.xlarge", None) ] else: raise ValueError(f"No testing models found for {image_type} on instance {device_type}. " f"please check whether the image_type and instance_type are supported.") elif image_type == "TEI": if device_type == "gpu": return [ - ("BAAI/bge-m3", None, "ml.g5.12xlarge"), - ("intfloat/multilingual-e5-base", None, "ml.g5.12xlarge"), - ("thenlper/gte-base", None, "ml.g5.12xlarge"), - ("sentence-transformers/all-MiniLM-L6-v2", None, "ml.g5.12xlarge") + ("BAAI/bge-m3", None, "ml.g5.12xlarge", None), + ("intfloat/multilingual-e5-base", None, "ml.g5.12xlarge", None), + ("thenlper/gte-base", None, "ml.g5.12xlarge", None), + ("sentence-transformers/all-MiniLM-L6-v2", None, "ml.g5.12xlarge", None) ] elif device_type == "cpu": - return [("BAAI/bge-m3", None, "ml.g5.12xlarge")] + return [("BAAI/bge-m3", None, "ml.g5.12xlarge", None)] + else: + raise ValueError(f"No testing models found for {image_type} on instance {device_type}. 
" + f"please check whether the image_type and instance_type are supported.") + + elif image_type == "TGILLAMACPP": + if device_type == "gpu": + return [("HuggingFaceTB/SmolLM2-1.7B-Instruct", None, "ml.g5.12xlarge", "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF")] + elif device_type == "cpu": + return [("HuggingFaceTB/SmolLM2-1.7B-Instruct", None, "ml.g5.12xlarge", "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF")] else: raise ValueError(f"No testing models found for {image_type} on instance {device_type}. " f"please check whether the image_type and instance_type are supported.") else: - raise ValueError("Invalid image type. Supported types are 'TGI' and 'TEI'.") + raise ValueError("Invalid image type. Supported types are 'TGI', 'TEI', and 'TGILLAMACPP'.") def should_run_test_for_image(test_type, target_type): return test_type == target_type @@ -109,6 +177,8 @@ def should_run_test_for_image(test_type, target_type): pytest.param("TGI", "inf2", marks=pytest.mark.inf2), pytest.param("TEI", "gpu", marks=pytest.mark.gpu), pytest.param("TEI", "cpu", marks=pytest.mark.cpu), + pytest.param("TGILLAMACPP", "gpu", marks=pytest.mark.gpu), + pytest.param("TGILLAMACPP", "cpu", marks=pytest.mark.cpu), ]) def test(image_type, device_type, timeout: str = "3000"): test_target_image_type = os.getenv("TARGET_IMAGE_TYPE") @@ -125,11 +195,12 @@ def test(image_type, device_type, timeout: str = "3000"): assert test_role_arn, f"Please set TEST_ROLE_ARN environment variable." 
models = get_models_for_image(image_type, device_type) - for model_id, model_revision, instance_type in models: + for model_id, model_revision, instance_type, model_gguf in models: args = argparse.Namespace( image_uri=image_uri, instance_type=instance_type, model_id=model_id, + model_gguf=model_gguf, model_revision=model_revision, role=test_role_arn, timeout=timeout @@ -143,6 +214,7 @@ def test(image_type, device_type, timeout: str = "3000"): arg_parser.add_argument("--image_uri", type=str, required=True) arg_parser.add_argument("--instance_type", type=str, required=True) arg_parser.add_argument("--model_id", type=str, required=True) + arg_parser.add_argument("--model_gguf", type=str, required=False) arg_parser.add_argument("--model_revision", type=str, required=False) arg_parser.add_argument("--role", type=str, required=True) arg_parser.add_argument("--timeout", type=str, required=True) From dded0efbce8c0556d820bd9dad23838483d5e2dd Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Wed, 16 Apr 2025 14:16:46 +0200 Subject: [PATCH 03/26] Fix Dockerfile --- .../pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile index 3edcf45..6b75273 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -86,6 +86,8 @@ COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin ENV HF_HUB_ENABLE_HF_TRANSFER=1 +FROM deps AS sagemaker + RUN HOME_DIR=/root && \ pip3 requests PTable && \ curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ @@ -96,14 +98,14 @@ RUN HOME_DIR=/root && \ ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ rm -rf ${HOME_DIR}/oss_compliance* + ENV 
LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" + ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp + COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES -COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/start-cuda-compat.sh /root/start-cuda-compat.sh +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh /root/start-cuda-compat.sh RUN chmod +x /root/start-cuda-compat.sh -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" -ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp - -COPY sagemaker-entrypoint.sh entrypoint.sh +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh ENTRYPOINT ["./entrypoint.sh"] From f04339aaf2ec95d76b6b1cb1fb172aa5efbb0c8d Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Wed, 16 Apr 2025 14:36:35 +0200 Subject: [PATCH 04/26] updated releases.json --- releases.json | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/releases.json b/releases.json index 068ff8d..97ad45c 100644 --- a/releases.json +++ b/releases.json @@ -100,28 +100,34 @@ "python_version": "py310", "pytorch_version": "2.0.1" } - ] - }, + ], + + "TGILLAMACPP": [ + { + "device": "gpu", + "min_version": "3.2.3", + "max_version": "3.2.3", + "os_version": "ubuntu22.04", + "cuda_version": "cu124", + "python_version": "py311", + "pytorch_version": "2.6.0" + } + ] +}, + "ignore_vulnerabilities": [ "CVE-2024-42154 - linux" ], "releases": [ + { - "framework": "TEI", - "device": "cpu", - "version": "1.6.0", - "os_version": "ubuntu22.04", - "python_version": "py310", - "pytorch_version": "2.0.1" - }, - { - "framework": "TEI", + "framework": "TGILLAMACPP", "device": "gpu", - "version": "1.6.0", + "version": "3.2.3", "os_version": "ubuntu22.04", - 
"cuda_version": "cu122", - "python_version": "py310", - "pytorch_version": "2.0.1" + "cuda_version": "cu124", + "python_version": "py311", + "pytorch_version": "2.6.0" } ] } From 5e8b18fcf756ee808b8531c4a2304a234cef866a Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Wed, 16 Apr 2025 14:55:54 +0200 Subject: [PATCH 05/26] Changed permitted devices --- huggingface/pytorch/release_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface/pytorch/release_utils.py b/huggingface/pytorch/release_utils.py index a111bbe..f705772 100644 --- a/huggingface/pytorch/release_utils.py +++ b/huggingface/pytorch/release_utils.py @@ -19,7 +19,7 @@ FRAMEWORK_DEVICE_DICT: Dict[str, List[str]] = { "TGI": ["GPU", "INF2"], "TEI": ["GPU", "CPU"], - "TGILLAMACPP": ["CPU"], + "TGILLAMACPP": ["GPU", "CPU"], } Framework = enum.Enum("Framework", ["TGI", "OPTIMUM", "TEI", "TGILLAMACPP"]) Device = enum.Enum("Device", ["GPU", "INF2", "CPU"]) From 42d19fbac8fccbdd27a6737d126d2d89f69f5cd3 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 16 Apr 2025 17:47:47 +0000 Subject: [PATCH 06/26] Added test for llamacpp backend --- tests/huggingface/sagemaker_dlc_test.py | 160 ++++++++++++++++-------- 1 file changed, 110 insertions(+), 50 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 15f8f3b..32331e1 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -20,8 +20,9 @@ class TimeoutError(Exception): def timeout_handler(signum, frame): raise TimeoutError("Test timed out") + def run_test(args): - default_env = { "HF_MODEL_ID": args.model_id } + default_env = {"HF_MODEL_ID": args.model_id} if args.model_revision: default_env["HF_MODEL_REVISION"] = args.model_revision if args.instance_type.startswith("ml.inf2"): @@ -30,9 +31,14 @@ def run_test(args): default_env["MAX_BATCH_SIZE"] = "1" default_env["MAX_INPUT_TOKENS"] = "2048" default_env["MAX_TOTAL_TOKENS"] = 
"4096" - elif args.image_type == "TGILLAMACPP": + if os.getenv("FRAMEWORK") == "TGILLAMACPP": + default_env["N_GPU_LAYERS"] = "99" + default_env["MAX_TOTAL_TOKENS"] = "2048" + default_env["MAX_BATCH_SIZE"] = "1" + default_env["TYPE_K"] = "q4-0" llamacpp_env = default_env.copy() llamacpp_env["HF_MODEL_GGUF"] = args.model_gguf + else: default_env["SM_NUM_GPUS"] = "4" @@ -40,26 +46,35 @@ def run_test(args): signal.alarm(int(args.timeout)) predictor = None second_predictor = None - + try: # Create Hugging Face Model Class - endpoint_name = args.model_id.replace("/","-").replace(".", "-")[:40] - endpoint_name = endpoint_name + "-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) - + endpoint_name = args.model_id.replace("/", "-").replace(".", "-")[:40] + endpoint_name = ( + endpoint_name + "-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) + ) + data = { - "inputs": "What is Deep Learning?", - "parameters": {"max_new_tokens": 50, "top_k": 50, "top_p": 0.95, "do_sample": True}, - } - + "inputs": "What is Deep Learning?", + "parameters": { + "max_new_tokens": 50, + "top_k": 50, + "top_p": 0.95, + "do_sample": True, + }, + } + # For TGILLAMACPP, test both with and without the GGUF environment - if args.image_type == "TGILLAMACPP": + if args.framework == "TGILLAMACPP": # First endpoint with default environment (without GGUF) - logging.info("Deploying first endpoint with default environment (without GGUF)") + logging.info( + "Deploying first endpoint with default environment (without GGUF)" + ) model = HuggingFaceModel( name=endpoint_name, env=default_env, role=args.role, - image_uri=args.image_uri + image_uri=args.image_uri, ) deploy_parameters = { "instance_type": args.instance_type, @@ -69,32 +84,34 @@ def run_test(args): } if args.instance_type.startswith("ml.inf2"): deploy_parameters["volume_size"] = 256 - + predictor = model.deploy(**deploy_parameters) logging.info("First endpoint deployment complete.") output = predictor.predict(data) logging.info("First 
endpoint output: " + json.dumps(output)) - + # Second endpoint with llamacpp environment (with GGUF) - logging.info("Deploying second endpoint with llamacpp environment (with GGUF)") + logging.info( + "Deploying second endpoint with llamacpp environment (with GGUF)" + ) second_endpoint_name = endpoint_name + "-gguf" second_model = HuggingFaceModel( name=second_endpoint_name, env=llamacpp_env, role=args.role, - image_uri=args.image_uri + image_uri=args.image_uri, ) - + second_deploy_parameters = deploy_parameters.copy() second_deploy_parameters["endpoint_name"] = second_endpoint_name - + second_predictor = second_model.deploy(**second_deploy_parameters) logging.info("Second endpoint deployment complete.") - + second_output = second_predictor.predict(data) logging.info("Second endpoint output: " + json.dumps(second_output)) - + else: # For other image types, just deploy a single endpoint env_to_use = default_env @@ -102,7 +119,7 @@ def run_test(args): name=endpoint_name, env=env_to_use, role=args.role, - image_uri=args.image_uri + image_uri=args.image_uri, ) deploy_parameters = { "instance_type": args.instance_type, @@ -112,13 +129,13 @@ def run_test(args): } if args.instance_type.startswith("ml.inf2"): deploy_parameters["volume_size"] = 256 - + predictor = model.deploy(**deploy_parameters) logging.info("Endpoint deployment complete.") output = predictor.predict(data) logging.info("Output: " + json.dumps(output)) - + # TODO: we need to clearly define the expected output format for each models. 
# assert "generated_text" in output[0] finally: @@ -131,6 +148,7 @@ def run_test(args): signal.alarm(0) signal.alarm(0) + def get_models_for_image(image_type, device_type): if image_type == "TGI": if device_type == "gpu": @@ -140,54 +158,94 @@ def get_models_for_image(image_type, device_type): ("google/flan-t5-xxl", None, "ml.g5.12xlarge", None), ] elif device_type == "inf2": - return [ ("princeton-nlp/Sheared-LLaMA-1.3B", None, "ml.inf2.xlarge", None) ] + return [("princeton-nlp/Sheared-LLaMA-1.3B", None, "ml.inf2.xlarge", None)] else: - raise ValueError(f"No testing models found for {image_type} on instance {device_type}. " - f"please check whether the image_type and instance_type are supported.") + raise ValueError( + f"No testing models found for {image_type} on instance {device_type}. " + f"please check whether the image_type and instance_type are supported." + ) elif image_type == "TEI": if device_type == "gpu": return [ ("BAAI/bge-m3", None, "ml.g5.12xlarge", None), ("intfloat/multilingual-e5-base", None, "ml.g5.12xlarge", None), ("thenlper/gte-base", None, "ml.g5.12xlarge", None), - ("sentence-transformers/all-MiniLM-L6-v2", None, "ml.g5.12xlarge", None) + ( + "sentence-transformers/all-MiniLM-L6-v2", + None, + "ml.g5.12xlarge", + None, + ), ] elif device_type == "cpu": return [("BAAI/bge-m3", None, "ml.g5.12xlarge", None)] else: - raise ValueError(f"No testing models found for {image_type} on instance {device_type}. " - f"please check whether the image_type and instance_type are supported.") - + raise ValueError( + f"No testing models found for {image_type} on instance {device_type}. " + f"please check whether the image_type and instance_type are supported." 
+ ) + elif image_type == "TGILLAMACPP": if device_type == "gpu": - return [("HuggingFaceTB/SmolLM2-1.7B-Instruct", None, "ml.g5.12xlarge", "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF")] + return [ + ( + "unsloth/Llama-3.2-1B-Instruct", + None, + "ml.g5.12xlarge", + "unsloth/Llama-3.2-1B-Instruct-GGUF", + ) + ] elif device_type == "cpu": - return [("HuggingFaceTB/SmolLM2-1.7B-Instruct", None, "ml.g5.12xlarge", "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF")] + return [ + ( + "unsloth/Llama-3.2-1B-Instruct", + None, + "ml.g5.12xlarge", + "unsloth/Llama-3.2-1B-Instruct-GGUF", + ) + ] else: - raise ValueError(f"No testing models found for {image_type} on instance {device_type}. " - f"please check whether the image_type and instance_type are supported.") + raise ValueError( + f"No testing models found for {image_type} on instance {device_type}. " + f"please check whether the image_type and instance_type are supported." + ) else: - raise ValueError("Invalid image type. Supported types are 'TGI', 'TEI', and 'TGILLAMACPP'.") + raise ValueError( + "Invalid image type. Supported types are 'TGI', 'TEI', and 'TGILLAMACPP'." 
+ ) + def should_run_test_for_image(test_type, target_type): return test_type == target_type -@pytest.mark.parametrize("image_type, device_type", [ - pytest.param("TGI", "gpu", marks=pytest.mark.gpu), - pytest.param("TGI", "inf2", marks=pytest.mark.inf2), - pytest.param("TEI", "gpu", marks=pytest.mark.gpu), - pytest.param("TEI", "cpu", marks=pytest.mark.cpu), - pytest.param("TGILLAMACPP", "gpu", marks=pytest.mark.gpu), - pytest.param("TGILLAMACPP", "cpu", marks=pytest.mark.cpu), -]) + +@pytest.mark.parametrize( + "image_type, device_type", + [ + pytest.param("TGI", "gpu", marks=pytest.mark.gpu), + pytest.param("TGI", "inf2", marks=pytest.mark.inf2), + pytest.param("TEI", "gpu", marks=pytest.mark.gpu), + pytest.param("TEI", "cpu", marks=pytest.mark.cpu), + pytest.param("TGILLAMACPP", "gpu", marks=pytest.mark.gpu), + pytest.param("TGILLAMACPP", "cpu", marks=pytest.mark.cpu), + ], +) def test(image_type, device_type, timeout: str = "3000"): test_target_image_type = os.getenv("TARGET_IMAGE_TYPE") test_device_type = os.getenv("DEVICE_TYPE") - if test_target_image_type and not should_run_test_for_image(image_type, test_target_image_type): - pytest.skip(f"Skipping test for image type {image_type} as it does not match target image type {test_target_image_type}") + if test_target_image_type and not should_run_test_for_image( + image_type, test_target_image_type + ): + pytest.skip( + f"Skipping test for image type {image_type} as it does not match target image type {test_target_image_type}" + ) - if test_device_type and not should_run_test_for_image(device_type, test_device_type): - pytest.skip(f"Skipping test for device type {device_type} as it does not match current device type {test_device_type}") + if test_device_type and not should_run_test_for_image( + device_type, test_device_type + ): + pytest.skip( + f"Skipping test for device type {device_type} as it does not match current device type {test_device_type}" + ) image_uri = os.getenv("IMAGE_URI") test_role_arn = 
os.getenv("TEST_ROLE_ARN") @@ -201,15 +259,16 @@ def test(image_type, device_type, timeout: str = "3000"): instance_type=instance_type, model_id=model_id, model_gguf=model_gguf, + framework=test_target_image_type, model_revision=model_revision, role=test_role_arn, - timeout=timeout + timeout=timeout, ) logging.info(f"Running sanity test with the following args: {args}.") run_test(args) -if __name__ == '__main__': +if __name__ == "__main__": arg_parser = argparse.ArgumentParser() arg_parser.add_argument("--image_uri", type=str, required=True) arg_parser.add_argument("--instance_type", type=str, required=True) @@ -218,6 +277,7 @@ def test(image_type, device_type, timeout: str = "3000"): arg_parser.add_argument("--model_revision", type=str, required=False) arg_parser.add_argument("--role", type=str, required=True) arg_parser.add_argument("--timeout", type=str, required=True) + arg_parser.add_argument("--framework", type=str, required=True) args = arg_parser.parse_args() run_test(args) From 238021f0e1a7f331e5f83901ca11995b94803c98 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 17 Apr 2025 15:45:12 +0000 Subject: [PATCH 07/26] Fix cpu instance tests --- tests/huggingface/sagemaker_dlc_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 32331e1..23066f7 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -32,7 +32,8 @@ def run_test(args): default_env["MAX_INPUT_TOKENS"] = "2048" default_env["MAX_TOTAL_TOKENS"] = "4096" if os.getenv("FRAMEWORK") == "TGILLAMACPP": - default_env["N_GPU_LAYERS"] = "99" + if os.getenv("DEVICE_TYPE") == "GPU": + default_env["N_GPU_LAYERS"] = "99" default_env["MAX_TOTAL_TOKENS"] = "2048" default_env["MAX_BATCH_SIZE"] = "1" default_env["TYPE_K"] = "q4-0" From 411a8a7182bf043485750604128ff99ad8680a07 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 17 Apr 2025 
15:46:08 +0000 Subject: [PATCH 08/26] Added cpu image for llamacpp --- .../tgillamacpp/docker/3.2.3/cpu/Dockerfile | 114 ++++++++++++++++++ .../docker/3.2.3/cpu/entrypoint.sh | 27 +++++ 2 files changed, 141 insertions(+) create mode 100644 huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile create mode 100644 huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile new file mode 100644 index 0000000..45bc34f --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile @@ -0,0 +1,114 @@ +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps + +ARG llamacpp_version=b4827 +ARG llamacpp_cuda=OFF +ARG llamacpp_native=ON +ARG llamacpp_cpu_arm_arch=native +ARG cuda_arch=75-real;80-real;86-real;89-real;90-real + +WORKDIR /opt/src + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ + clang \ + cmake \ + curl \ + git \ + python3-dev \ + unzip \ + libssl-dev \ + pkg-config \ + tar + +ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN mkdir -p llama.cpp \ + && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ + && cd llama.cpp \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ + -DGGML_CUDA=${llamacpp_cuda} \ + -DGGML_NATIVE=${llamacpp_native} \ + -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + && cmake --build build --parallel --config Release \ + && cmake --install build + +WORKDIR /app +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile 
minimal -y +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked + +FROM deps AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json + +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ + --profile release \ + --package text-generation-router-llamacpp +COPY . . +RUN cargo build \ + --profile release \ + --package text-generation-router-llamacpp --frozen + +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker +WORKDIR /app + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ + python3-venv \ + unzip \ + curl \ + python3-pip + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +COPY backends/llamacpp/requirements.txt requirements.txt +COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py +COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ + +RUN pip3 install --no-cache-dir \ + -r requirements.txt \ + -e gguf-py + +COPY --from=builder /usr/lib/libllama.so /usr/lib/ +COPY --from=builder /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + + +RUN HOME_DIR=/root && \ + pip3 install requests PTable setuptools && \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ + chmod +x /usr/local/bin/testOSSCompliance && \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ + ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ + rm -rf ${HOME_DIR}/oss_compliance* + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" +ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:cpu:inference:tgi-llamacpp + +COPY 
/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh entrypoint.sh +RUN chmod +x entrypoint.sh + +ENTRYPOINT ["./entrypoint.sh"] +CMD ["--json-output"] + +LABEL dlc_major_version="2" +LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh new file mode 100644 index 0000000..2bfa61c --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash +if [[ -z "${HF_MODEL_ID}" ]]; then + echo "HF_MODEL_ID must be set" + exit 1 +fi +export MODEL_ID="${HF_MODEL_ID}" + +mkdir -p models + +if [[ -n "${HF_MODEL_GGUF}" ]]; then + if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then + huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" + else + huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" + fi + + if [[ -z "${MODEL_GGUF}" ]]; then + echo "No gguf files found in ./models/${HF_MODEL_GGUF}" + exit 1 + fi +fi + +text-generation-router-llamacpp --port 8080 \ No newline at end of file From 4dd4d5592842fbad4b5538bd5b4f829b1715e79e Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Tue, 22 Apr 2025 15:59:26 +0200 Subject: [PATCH 09/26] Added cpu version for tgillamacpp in releases.json 
--- releases.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/releases.json b/releases.json index 97ad45c..7f9f2b8 100644 --- a/releases.json +++ b/releases.json @@ -111,6 +111,14 @@ "cuda_version": "cu124", "python_version": "py311", "pytorch_version": "2.6.0" + }, + { + "device": "cpu", + "min_version": "3.2.3", + "max_version": "3.2.3", + "os_version": "ubuntu22.04", + "python_version": "py311", + "pytorch_version": "2.6.0" } ] }, @@ -128,6 +136,14 @@ "cuda_version": "cu124", "python_version": "py311", "pytorch_version": "2.6.0" + }, + { + "framework": "TGILLAMACPP", + "device": "cpu", + "version": "3.2.3", + "os_version": "ubuntu22.04", + "python_version": "py311", + "pytorch_version": "2.6.0" } ] } From d4d79903f9b0b5f77a42b27e96f9fa7e9f4a3331 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Tue, 22 Apr 2025 16:01:08 +0200 Subject: [PATCH 10/26] dlc major version fix --- .../tgillamacpp/docker/3.2.3/gpu/Dockerfile | 6 ++---- .../docker/3.2.3/gpu/entrypoint.sh | 19 ++++++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile index 6b75273..8428235 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -61,7 +61,7 @@ RUN cargo build \ --profile release \ --package text-generation-router-llamacpp --frozen -FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker WORKDIR /app ENV DEBIAN_FRONTEND=noninteractive @@ -86,8 +86,6 @@ COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin ENV HF_HUB_ENABLE_HF_TRANSFER=1 -FROM deps AS sagemaker - RUN HOME_DIR=/root && \ pip3 requests PTable && \ curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ @@ -111,6 +109,6 @@ RUN chmod +x 
entrypoint.sh ENTRYPOINT ["./entrypoint.sh"] CMD ["--json-output"] -LABEL dlc_major_version="2" +LABEL dlc_major_version="1" LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh index cc9fd9b..2bfa61c 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh @@ -5,26 +5,23 @@ if [[ -z "${HF_MODEL_ID}" ]]; then fi export MODEL_ID="${HF_MODEL_ID}" -if [[ -z "${HF_MODEL_GGUF}" ]]; then - echo "HF_MODEL_GGUF must be set" - exit 1 -fi +mkdir -p models -mkdir models - -if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then - huggingface-cli download "{$HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" +if [[ -n "${HF_MODEL_GGUF}" ]]; then + if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then + huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" -else + else huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" -fi + fi -if [[ -z "${MODEL_GGUF}" ]]; then + if [[ -z "${MODEL_GGUF}" ]]; then echo "No gguf files found in ./models/${HF_MODEL_GGUF}" exit 1 + fi fi text-generation-router-llamacpp --port 8080 \ No newline at end of file From a82bbb4e0c91a41312874269a90c980babaa1ffe Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Tue, 
22 Apr 2025 16:37:11 +0200 Subject: [PATCH 11/26] fix releases.json --- releases.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/releases.json b/releases.json index 7f9f2b8..fa46d45 100644 --- a/releases.json +++ b/releases.json @@ -117,6 +117,7 @@ "min_version": "3.2.3", "max_version": "3.2.3", "os_version": "ubuntu22.04", + "cuda_version": "cu124", "python_version": "py311", "pytorch_version": "2.6.0" } @@ -141,6 +142,7 @@ "framework": "TGILLAMACPP", "device": "cpu", "version": "3.2.3", + "cuda_version": "cu124", "os_version": "ubuntu22.04", "python_version": "py311", "pytorch_version": "2.6.0" From 718cb3fb0cf7fb2718347f23f9830bb363b22afc Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 25 Apr 2025 16:53:31 +0000 Subject: [PATCH 12/26] various fixes gpu --- .../tgillamacpp/docker/3.2.3/gpu/Dockerfile | 12 ++++++------ .../docker/3.2.3/gpu/entrypoint.sh | 19 ++++++++----------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile index 6b75273..6d3a6eb 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -61,12 +61,14 @@ RUN cargo build \ --profile release \ --package text-generation-router-llamacpp --frozen -FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker WORKDIR /app ENV DEBIAN_FRONTEND=noninteractive RUN apt update && apt upgrade -y && apt install -y \ python3-venv \ + unzip \ + curl \ python3-pip RUN python3 -m venv /venv @@ -85,11 +87,11 @@ COPY --from=builder /usr/lib/libggml*.so /usr/lib/ COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ ENV HF_HUB_ENABLE_HF_TRANSFER=1 +ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp -FROM deps AS sagemaker RUN HOME_DIR=/root && \ - pip3 requests PTable && \ + 
pip3 install requests PTable setuptools && \ curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ @@ -98,8 +100,6 @@ RUN HOME_DIR=/root && \ ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ rm -rf ${HOME_DIR}/oss_compliance* - ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" - ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh /root/start-cuda-compat.sh @@ -111,6 +111,6 @@ RUN chmod +x entrypoint.sh ENTRYPOINT ["./entrypoint.sh"] CMD ["--json-output"] -LABEL dlc_major_version="2" +LABEL dlc_major_version="1" LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh index cc9fd9b..2bfa61c 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh @@ -5,26 +5,23 @@ if [[ -z "${HF_MODEL_ID}" ]]; then fi export MODEL_ID="${HF_MODEL_ID}" -if [[ -z "${HF_MODEL_GGUF}" ]]; then - echo "HF_MODEL_GGUF must be set" - exit 1 -fi +mkdir -p models -mkdir models - -if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then - huggingface-cli download "{$HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" +if [[ -n "${HF_MODEL_GGUF}" ]]; then + if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then + huggingface-cli download "${HF_MODEL_GGUF}" --include 
"${HF_MODEL_GGUF_DIR}"/*.gguf --local-dir ./models/"${HF_MODEL_GGUF}" echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" -else + else huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" -fi + fi -if [[ -z "${MODEL_GGUF}" ]]; then + if [[ -z "${MODEL_GGUF}" ]]; then echo "No gguf files found in ./models/${HF_MODEL_GGUF}" exit 1 + fi fi text-generation-router-llamacpp --port 8080 \ No newline at end of file From 11b09a760d9dea93521b4afc96c4db067429ad33 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 25 Apr 2025 16:53:55 +0000 Subject: [PATCH 13/26] various fixes cpu --- huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile index 45bc34f..a481095 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile @@ -109,6 +109,6 @@ RUN chmod +x entrypoint.sh ENTRYPOINT ["./entrypoint.sh"] CMD ["--json-output"] -LABEL dlc_major_version="2" +LABEL dlc_major_version="1" LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file From c5561d151c98c58bad0cbdc59281216206f76a31 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 1 May 2025 14:37:12 +0000 Subject: [PATCH 14/26] Changed to smaller model for llamacpp tests --- tests/huggingface/sagemaker_dlc_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 23066f7..efafa7b 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -190,19 +190,19 @@ def get_models_for_image(image_type, device_type): if device_type == "gpu": return [ ( - "unsloth/Llama-3.2-1B-Instruct", + "unsloth/Qwen3-0.6B", None, "ml.g5.12xlarge", - "unsloth/Llama-3.2-1B-Instruct-GGUF", + "unsloth/Qwen3-0.6B-GGUF", ) ] elif device_type == "cpu": return [ ( - "unsloth/Llama-3.2-1B-Instruct", + "unsloth/Qwen3-0.6B", None, "ml.g5.12xlarge", - "unsloth/Llama-3.2-1B-Instruct-GGUF", + "unsloth/Qwen3-0.6B-GGUF", ) ] else: From 94d5108ae472cfb8455ee40b769440a0cab79797 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Tue, 20 May 2025 16:43:52 -0400 Subject: [PATCH 15/26] changed models for tgillamacpp tests --- tests/huggingface/sagemaker_dlc_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index efafa7b..86b5444 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -190,19 +190,19 @@ def get_models_for_image(image_type, device_type): if device_type == "gpu": return [ ( - "unsloth/Qwen3-0.6B", + "Qwen/Qwen2-0.5B-Instruct", None, "ml.g5.12xlarge", - "unsloth/Qwen3-0.6B-GGUF", + "Qwen/Qwen2-0.5B-Instruct-GGUF", ) ] elif device_type == "cpu": return [ ( - "unsloth/Qwen3-0.6B", + "Qwen/Qwen2-0.5B-Instruct", None, "ml.g5.12xlarge", - "unsloth/Qwen3-0.6B-GGUF", + "Qwen/Qwen2-0.5B-Instruct-GGUF", ) ] else: From 7a8a453ea381af9643404f04cb461cd75a573d4c Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 27 May 2025 14:43:48 +0000 Subject: [PATCH 16/26] changed instance --- tests/huggingface/sagemaker_dlc_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py 
b/tests/huggingface/sagemaker_dlc_test.py index 86b5444..810d475 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -192,7 +192,7 @@ def get_models_for_image(image_type, device_type): ( "Qwen/Qwen2-0.5B-Instruct", None, - "ml.g5.12xlarge", + "ml.g5.24xlarge", "Qwen/Qwen2-0.5B-Instruct-GGUF", ) ] @@ -201,7 +201,7 @@ def get_models_for_image(image_type, device_type): ( "Qwen/Qwen2-0.5B-Instruct", None, - "ml.g5.12xlarge", + "ml.g5.24xlarge", "Qwen/Qwen2-0.5B-Instruct-GGUF", ) ] From 13040f2b78e23b8fcd648c4f73915ca3bdd4171b Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 27 May 2025 15:34:47 +0000 Subject: [PATCH 17/26] changed gguf model --- tests/huggingface/sagemaker_dlc_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 810d475..adb5048 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -192,8 +192,8 @@ def get_models_for_image(image_type, device_type): ( "Qwen/Qwen2-0.5B-Instruct", None, - "ml.g5.24xlarge", - "Qwen/Qwen2-0.5B-Instruct-GGUF", + "ml.g5.12xlarge", + None, ) ] elif device_type == "cpu": @@ -201,8 +201,8 @@ def get_models_for_image(image_type, device_type): ( "Qwen/Qwen2-0.5B-Instruct", None, - "ml.g5.24xlarge", - "Qwen/Qwen2-0.5B-Instruct-GGUF", + "ml.g5.12xlarge", + None, ) ] else: From 6e86ba6c1a019c35a57191c57e8270a818fc3cc9 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 12:16:31 +0200 Subject: [PATCH 18/26] change cuda file location --- huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile index 8f9eb44..c4c734a 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile +++ 
b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -101,8 +101,8 @@ RUN HOME_DIR=/root && \ COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES -COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh /root/start-cuda-compat.sh -RUN chmod +x /root/start-cuda-compat.sh +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh start-cuda-compat.sh +RUN chmod +x start-cuda-compat.sh COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh From fbd94e988a8e1a3148ff874f629105a9463b54a7 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 13:01:14 +0200 Subject: [PATCH 19/26] change cuda version in releases.json --- releases.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/releases.json b/releases.json index 7fa9c3d..398307a 100644 --- a/releases.json +++ b/releases.json @@ -108,7 +108,7 @@ "min_version": "3.2.3", "max_version": "3.2.3", "os_version": "ubuntu22.04", - "cuda_version": "cu124", + "cuda_version": "cu128", "python_version": "py311", "pytorch_version": "2.6.0" }, @@ -117,7 +117,7 @@ "min_version": "3.2.3", "max_version": "3.2.3", "os_version": "ubuntu22.04", - "cuda_version": "cu124", + "cuda_version": "cu128", "python_version": "py311", "pytorch_version": "2.6.0" } @@ -143,7 +143,7 @@ "framework": "TGILLAMACPP", "device": "cpu", "version": "3.2.3", - "cuda_version": "cu124", + "cuda_version": "cu128", "os_version": "ubuntu22.04", "python_version": "py311", "pytorch_version": "2.6.0" From b95b21c62a92787fc6528ba8407da3f170e67eae Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 14:45:46 +0200 Subject: [PATCH 20/26] change cuda version in releases.json - remove cuda compat script --- huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile | 2 +- releases.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile index c4c734a..704dbca 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -102,7 +102,7 @@ RUN HOME_DIR=/root && \ COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh start-cuda-compat.sh -RUN chmod +x start-cuda-compat.sh +#RUN chmod +x start-cuda-compat.sh COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh diff --git a/releases.json b/releases.json index 398307a..147d376 100644 --- a/releases.json +++ b/releases.json @@ -135,7 +135,7 @@ "device": "gpu", "version": "3.2.3", "os_version": "ubuntu22.04", - "cuda_version": "cu124", + "cuda_version": "cu128", "python_version": "py311", "pytorch_version": "2.6.0" }, From 109550a0aa1a1ea26cf22d1b400e63e7b85328e2 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 15:38:51 +0200 Subject: [PATCH 21/26] removed container_startup_health_check_timeout --- tests/huggingface/sagemaker_dlc_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index adb5048..eff433f 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -81,7 +81,7 @@ def run_test(args): "instance_type": args.instance_type, "initial_instance_count": 1, "endpoint_name": endpoint_name, - "container_startup_health_check_timeout": 1800, + #"container_startup_health_check_timeout": 1800, } if args.instance_type.startswith("ml.inf2"): deploy_parameters["volume_size"] = 256 @@ -106,6 +106,7 @@ def run_test(args): second_deploy_parameters = deploy_parameters.copy() second_deploy_parameters["endpoint_name"] = 
second_endpoint_name + print(second_deploy_parameters) second_predictor = second_model.deploy(**second_deploy_parameters) logging.info("Second endpoint deployment complete.") @@ -126,7 +127,7 @@ def run_test(args): "instance_type": args.instance_type, "initial_instance_count": 1, "endpoint_name": endpoint_name, - "container_startup_health_check_timeout": 1800, + #"container_startup_health_check_timeout": 1800, } if args.instance_type.startswith("ml.inf2"): deploy_parameters["volume_size"] = 256 From 95b191f6364447e1d5b840ba951c75f52902b902 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 16:09:21 +0200 Subject: [PATCH 22/26] changed test payload --- tests/huggingface/sagemaker_dlc_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index eff433f..2d2ff03 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -58,10 +58,10 @@ def run_test(args): data = { "inputs": "What is Deep Learning?", "parameters": { - "max_new_tokens": 50, + "max_new_tokens": 10, "top_k": 50, "top_p": 0.95, - "do_sample": True, + "do_sample": False, }, } From 6996a755919ef73e1c8792143fbb8d6d102c7c7c Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 16:50:15 +0200 Subject: [PATCH 23/26] changed instance for cpu --- releases.json | 18 +++++++++--------- tests/huggingface/sagemaker_dlc_test.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/releases.json b/releases.json index 147d376..b882a47 100644 --- a/releases.json +++ b/releases.json @@ -130,15 +130,15 @@ ], "releases": [ - { - "framework": "TGILLAMACPP", - "device": "gpu", - "version": "3.2.3", - "os_version": "ubuntu22.04", - "cuda_version": "cu128", - "python_version": "py311", - "pytorch_version": "2.6.0" - }, + // { + // "framework": "TGILLAMACPP", + // "device": "gpu", + // "version": "3.2.3", + // "os_version": "ubuntu22.04", + // 
"cuda_version": "cu128", + // "python_version": "py311", + // "pytorch_version": "2.6.0" + // }, { "framework": "TGILLAMACPP", "device": "cpu", diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 2d2ff03..a3c336f 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -202,7 +202,7 @@ def get_models_for_image(image_type, device_type): ( "Qwen/Qwen2-0.5B-Instruct", None, - "ml.g5.12xlarge", + "ml.m5.12xlarge", None, ) ] From 70eaac52207a9e124e33a79b37078459f927e713 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 16:54:58 +0200 Subject: [PATCH 24/26] changed instance for cpu --- releases.json | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/releases.json b/releases.json index b882a47..cea3e17 100644 --- a/releases.json +++ b/releases.json @@ -129,16 +129,6 @@ "CVE-2025-32434 - torch" ], "releases": [ - - // { - // "framework": "TGILLAMACPP", - // "device": "gpu", - // "version": "3.2.3", - // "os_version": "ubuntu22.04", - // "cuda_version": "cu128", - // "python_version": "py311", - // "pytorch_version": "2.6.0" - // }, { "framework": "TGILLAMACPP", "device": "cpu", From 6bae75136ae249994a14a93103c6a20d485cdba0 Mon Sep 17 00:00:00 2001 From: fgbelidji Date: Fri, 6 Jun 2025 18:06:51 +0200 Subject: [PATCH 25/26] Reverted latest changes --- .../pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile | 2 +- releases.json | 2 +- tests/huggingface/sagemaker_dlc_test.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile index 704dbca..c4c734a 100644 --- a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -102,7 +102,7 @@ RUN HOME_DIR=/root && \ COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES 
/root/THIRD-PARTY-LICENSES COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh start-cuda-compat.sh -#RUN chmod +x start-cuda-compat.sh +RUN chmod +x start-cuda-compat.sh COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh diff --git a/releases.json b/releases.json index cea3e17..37e7da7 100644 --- a/releases.json +++ b/releases.json @@ -131,7 +131,7 @@ "releases": [ { "framework": "TGILLAMACPP", - "device": "cpu", + "device": "gpu", "version": "3.2.3", "cuda_version": "cu128", "os_version": "ubuntu22.04", diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index a3c336f..93739a9 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -58,10 +58,10 @@ def run_test(args): data = { "inputs": "What is Deep Learning?", "parameters": { - "max_new_tokens": 10, + "max_new_tokens": 50, "top_k": 50, "top_p": 0.95, - "do_sample": False, + "do_sample": True, }, } @@ -81,7 +81,7 @@ def run_test(args): "instance_type": args.instance_type, "initial_instance_count": 1, "endpoint_name": endpoint_name, - #"container_startup_health_check_timeout": 1800, + "container_startup_health_check_timeout": 1800, } if args.instance_type.startswith("ml.inf2"): deploy_parameters["volume_size"] = 256 @@ -106,7 +106,7 @@ def run_test(args): second_deploy_parameters = deploy_parameters.copy() second_deploy_parameters["endpoint_name"] = second_endpoint_name - print(second_deploy_parameters) + second_predictor = second_model.deploy(**second_deploy_parameters) logging.info("Second endpoint deployment complete.") @@ -127,7 +127,7 @@ def run_test(args): "instance_type": args.instance_type, "initial_instance_count": 1, "endpoint_name": endpoint_name, - #"container_startup_health_check_timeout": 1800, + "container_startup_health_check_timeout": 1800, } if args.instance_type.startswith("ml.inf2"): deploy_parameters["volume_size"] = 256 
From 371ca77227ab44f5519bb89ba6eacab42c7dd5a7 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 6 Jun 2025 17:45:48 +0000 Subject: [PATCH 26/26] added missig model-gguf name --- tests/huggingface/sagemaker_dlc_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 93739a9..c4d3228 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -194,7 +194,7 @@ def get_models_for_image(image_type, device_type): "Qwen/Qwen2-0.5B-Instruct", None, "ml.g5.12xlarge", - None, + "Qwen/Qwen2.5-0.5B-Instruct-GGUF", ) ] elif device_type == "cpu": @@ -203,7 +203,7 @@ def get_models_for_image(image_type, device_type): "Qwen/Qwen2-0.5B-Instruct", None, "ml.m5.12xlarge", - None, + "Qwen/Qwen2.5-0.5B-Instruct-GGUF", ) ] else: