diff --git a/huggingface/pytorch/release_utils.py b/huggingface/pytorch/release_utils.py index a111bbe7..f705772e 100644 --- a/huggingface/pytorch/release_utils.py +++ b/huggingface/pytorch/release_utils.py @@ -19,7 +19,7 @@ FRAMEWORK_DEVICE_DICT: Dict[str, List[str]] = { "TGI": ["GPU", "INF2"], "TEI": ["GPU", "CPU"], - "TGILLAMACPP": ["CPU"], + "TGILLAMACPP": ["GPU", "CPU"], } Framework = enum.Enum("Framework", ["TGI", "OPTIMUM", "TEI", "TGILLAMACPP"]) Device = enum.Enum("Device", ["GPU", "INF2", "CPU"]) diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES b/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES new file mode 100644 index 00000000..4f3f5dcf --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES @@ -0,0 +1,583 @@ +** text-generation-inference; version 3.2.3 -- https://github.com/huggingface/text-generation-inference + +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2022 Hugging Face + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------ + +** transformers; version 4.49.0 -- https://github.com/huggingface/transformers + +Copyright 2018- The Hugging Face team. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +* For transformers see also this required NOTICE: + Copyright 2018- The Hugging Face team. All rights reserved. + + https://github.com/huggingface/transformers/blob/main/LICENSE + +------ + +** Flash Attention; version 2.3.3 -- https://github.com/Dao-AILab/flash-attention/tree/v2.3.2 +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +BSD 3-Clause License + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------ + +** PyTorch; version 2.6.0 -- https://github.com/pytorch/pytorch +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. 
+ +All contributions by Arm: +Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +------ + +** miniforge; version 23.3.1-1 -- https://github.com/conda-forge/miniforge/tree/23.3.1-1 +Copyright (c) 2019-2022, conda-forge +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Miniforge installer code uses BSD-3-Clause license as stated below. + +Binary packages that come with it have their own licensing terms +and by installing miniforge you agree to the licensing terms of individual +packages as well. They include different OSI-approved licenses including +the GNU General Public License and can be found in pkgs/<pkg-name>/info/licenses +folders. + +Miniforge installer comes with a bootstrapping executable that is used +when installing miniforge and is deleted after miniforge is installed. +The bootstrapping executable uses micromamba, cli11, cpp-filesystem, +curl, c-ares, krb5, libarchive, libev, lz4, nghttp2, openssl, libsolv, +nlohmann-json, reproc and zstd which are licensed under BSD-3-Clause, +MIT and OpenSSL licenses. Licenses and copyright notices of these +projects can be found at the following URL. +https://github.com/conda-forge/micromamba-feedstock/tree/master/recipe.
diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile new file mode 100644 index 00000000..a4810958 --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/Dockerfile @@ -0,0 +1,114 @@ +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps + +ARG llamacpp_version=b4827 +ARG llamacpp_cuda=OFF +ARG llamacpp_native=ON +ARG llamacpp_cpu_arm_arch=native +ARG cuda_arch=75-real;80-real;86-real;89-real;90-real + +WORKDIR /opt/src + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ + clang \ + cmake \ + curl \ + git \ + python3-dev \ + unzip \ + libssl-dev \ + pkg-config \ + tar + +ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN mkdir -p llama.cpp \ + && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ + && cd llama.cpp \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ + -DGGML_CUDA=${llamacpp_cuda} \ + -DGGML_NATIVE=${llamacpp_native} \ + -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + && cmake --build build --parallel --config Release \ + && cmake --install build + +WORKDIR /app +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked + +FROM deps AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json + +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ + --profile release \ + --package text-generation-router-llamacpp +COPY . . 
+RUN cargo build \ + --profile release \ + --package text-generation-router-llamacpp --frozen + +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker +WORKDIR /app + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ + python3-venv \ + unzip \ + curl \ + python3-pip + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +COPY backends/llamacpp/requirements.txt requirements.txt +COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py +COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ + +RUN pip3 install --no-cache-dir \ + -r requirements.txt \ + -e gguf-py + +COPY --from=builder /usr/lib/libllama.so /usr/lib/ +COPY --from=builder /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + + +RUN HOME_DIR=/root && \ + pip3 install requests PTable setuptools && \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ + chmod +x /usr/local/bin/testOSSCompliance && \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ + ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ + rm -rf ${HOME_DIR}/oss_compliance* + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" +ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:cpu:inference:tgi-llamacpp + +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh entrypoint.sh +RUN chmod +x entrypoint.sh + +ENTRYPOINT ["./entrypoint.sh"] +CMD ["--json-output"] + +LABEL dlc_major_version="1" +LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" +LABEL 
com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh new file mode 100644 index 00000000..2bfa61ca --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/cpu/entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash +if [[ -z "${HF_MODEL_ID}" ]]; then + echo "HF_MODEL_ID must be set" + exit 1 +fi +export MODEL_ID="${HF_MODEL_ID}" + +mkdir -p models + +if [[ -n "${HF_MODEL_GGUF}" ]]; then + if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then + huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}/*.gguf" --local-dir ./models/"${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" + else + huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" + fi + + if [[ -z "${MODEL_GGUF}" ]]; then + echo "No gguf files found in ./models/${HF_MODEL_GGUF}" + exit 1 + fi +fi + +text-generation-router-llamacpp --port 8080 \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile new file mode 100644 index 00000000..c4c734ac --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/Dockerfile @@ -0,0 +1,115 @@ +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps + +ARG llamacpp_version=b4827 +ARG llamacpp_cuda=ON +ARG llamacpp_native=ON +ARG llamacpp_cpu_arm_arch=native +ARG cuda_arch=75-real;80-real;86-real;89-real;90-real + +WORKDIR /opt/src + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && 
apt upgrade -y && apt install -y \ + clang \ + cmake \ + curl \ + git \ + python3-dev \ + unzip \ + libssl-dev \ + pkg-config \ + tar + +ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN mkdir -p llama.cpp \ + && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ + && cd llama.cpp \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ + -DGGML_CUDA=${llamacpp_cuda} \ + -DGGML_NATIVE=${llamacpp_native} \ + -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + && cmake --build build --parallel --config Release \ + && cmake --install build + +WORKDIR /app +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked + +FROM deps AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json + +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ + --profile release \ + --package text-generation-router-llamacpp +COPY . . 
+RUN cargo build \ + --profile release \ + --package text-generation-router-llamacpp --frozen + +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 AS sagemaker +WORKDIR /app + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ + python3-venv \ + unzip \ + curl \ + python3-pip + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +COPY backends/llamacpp/requirements.txt requirements.txt +COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py +COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ + +RUN pip3 install --no-cache-dir \ + -r requirements.txt \ + -e gguf-py + +COPY --from=builder /usr/lib/libllama.so /usr/lib/ +COPY --from=builder /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 +ENV HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tgi-llamacpp + +RUN HOME_DIR=/root && \ + pip3 install requests PTable setuptools && \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ + chmod +x /usr/local/bin/testOSSCompliance && \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ + ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ + rm -rf ${HOME_DIR}/oss_compliance* + + +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh start-cuda-compat.sh +RUN chmod +x start-cuda-compat.sh + +COPY /huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh entrypoint.sh +RUN chmod +x entrypoint.sh + +ENTRYPOINT ["./entrypoint.sh"] +CMD ["--json-output"] + +LABEL dlc_major_version="1" +LABEL 
com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh new file mode 100644 index 00000000..2bfa61ca --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash +if [[ -z "${HF_MODEL_ID}" ]]; then + echo "HF_MODEL_ID must be set" + exit 1 +fi +export MODEL_ID="${HF_MODEL_ID}" + +mkdir -p models + +if [[ -n "${HF_MODEL_GGUF}" ]]; then + if [[ -n "$HF_MODEL_GGUF_DIR" ]]; then + huggingface-cli download "${HF_MODEL_GGUF}" --include "${HF_MODEL_GGUF_DIR}/*.gguf" --local-dir ./models/"${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}/${HF_MODEL_GGUF_DIR}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}"/"${HF_MODEL_GGUF_DIR}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" + else + huggingface-cli download "${HF_MODEL_GGUF}" --local-dir "./models/${HF_MODEL_GGUF}" + echo "Downloaded model gguf files to ./models/${HF_MODEL_GGUF}" + export MODEL_GGUF="$(find ./models/"${HF_MODEL_GGUF}" -maxdepth 1 -type f -name "*.gguf" | sort | head -n 1)" + fi + + if [[ -z "${MODEL_GGUF}" ]]; then + echo "No gguf files found in ./models/${HF_MODEL_GGUF}" + exit 1 + fi +fi + +text-generation-router-llamacpp --port 8080 \ No newline at end of file diff --git a/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh new file mode 100644 index 00000000..363cc141 --- /dev/null +++ b/huggingface/pytorch/tgillamacpp/docker/3.2.3/gpu/start-cuda-compat.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +verlt() { + [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] +} + +if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then + 
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-) + echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}" + NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}" + if verlt $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then + echo "Adding CUDA compat to LD_LIBRARY_PATH" + export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH + echo $LD_LIBRARY_PATH + else + echo "Skipping CUDA compat setup as newer NVIDIA driver is installed" + fi +else + echo "Skipping CUDA compat setup as package not found" +fi \ No newline at end of file diff --git a/releases.json b/releases.json index 24781f63..37e7da77 100644 --- a/releases.json +++ b/releases.json @@ -100,37 +100,43 @@ "python_version": "py310", "pytorch_version": "2.0.1" } - ] - }, - "ignore_vulnerabilities": [ - "CVE-2024-42154 - linux", - "CVE-2025-32434 - torch" - ], - "releases": [ + ], + + "TGILLAMACPP": [ { - "framework": "TEI", "device": "gpu", - "version": "1.7.0", + "min_version": "3.2.3", + "max_version": "3.2.3", "os_version": "ubuntu22.04", - "python_version": "py310", - "pytorch_version": "2.0.1", - "cuda_version": "cu122" + "cuda_version": "cu128", + "python_version": "py311", + "pytorch_version": "2.6.0" }, { - "framework": "TEI", "device": "cpu", - "version": "1.7.0", + "min_version": "3.2.3", + "max_version": "3.2.3", "os_version": "ubuntu22.04", - "python_version": "py310", - "pytorch_version": "2.0.1" - }, + "cuda_version": "cu128", + "python_version": "py311", + "pytorch_version": "2.6.0" + } + ] +}, + + "ignore_vulnerabilities": [ + "CVE-2024-42154 - linux", + "CVE-2025-32434 - torch" + ], + "releases": [ { - "framework": "TGI", - "device": "inf2", - "version": "0.0.28", + "framework": "TGILLAMACPP", + "device": "gpu", + "version": 
"3.2.3", + "cuda_version": "cu128", "os_version": "ubuntu22.04", - "python_version": "py310", - "pytorch_version": "2.1.2" + "python_version": "py311", + "pytorch_version": "2.6.0" } ] } diff --git a/tests/huggingface/sagemaker_dlc_test.py b/tests/huggingface/sagemaker_dlc_test.py index 78272e56..c4d3228e 100644 --- a/tests/huggingface/sagemaker_dlc_test.py +++ b/tests/huggingface/sagemaker_dlc_test.py @@ -20,8 +20,9 @@ class TimeoutError(Exception): def timeout_handler(signum, frame): raise TimeoutError("Test timed out") + def run_test(args): - default_env = { "HF_MODEL_ID": args.model_id } + default_env = {"HF_MODEL_ID": args.model_id} if args.model_revision: default_env["HF_MODEL_REVISION"] = args.model_revision if args.instance_type.startswith("ml.inf2"): @@ -30,94 +31,223 @@ def run_test(args): default_env["MAX_BATCH_SIZE"] = "1" default_env["MAX_INPUT_TOKENS"] = "2048" default_env["MAX_TOTAL_TOKENS"] = "4096" + if args.framework == "TGILLAMACPP" or os.getenv("FRAMEWORK") == "TGILLAMACPP": + if (os.getenv("DEVICE_TYPE") or "").upper() == "GPU": + default_env["N_GPU_LAYERS"] = "99" + default_env["MAX_TOTAL_TOKENS"] = "2048" + default_env["MAX_BATCH_SIZE"] = "1" + default_env["TYPE_K"] = "q4-0" + llamacpp_env = default_env.copy() + llamacpp_env["HF_MODEL_GGUF"] = args.model_gguf + else: default_env["SM_NUM_GPUS"] = "4" signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(int(args.timeout)) predictor = None + second_predictor = None + try: # Create Hugging Face Model Class - endpoint_name = args.model_id.replace("/","-").replace(".", "-")[:40] - endpoint_name = endpoint_name + "-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) - model = HuggingFaceModel( - name=endpoint_name, - env=default_env, - role=args.role, - image_uri=args.image_uri + endpoint_name = args.model_id.replace("/", "-").replace(".", "-")[:40] + endpoint_name = ( + endpoint_name + "-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) ) - deploy_parameters = { - "instance_type": args.instance_type, - "initial_instance_count": 1, - 
"endpoint_name": endpoint_name, - "container_startup_health_check_timeout": 1800, - } - if args.instance_type.startswith("ml.inf2"): - deploy_parameters["volume_size"] = 256 - predictor = model.deploy(**deploy_parameters) - - logging.info("Endpoint deployment complete.") data = { "inputs": "What is Deep Learning?", - "parameters": {"max_new_tokens": 50, "top_k": 50, "top_p": 0.95, "do_sample": True}, + "parameters": { + "max_new_tokens": 50, + "top_k": 50, + "top_p": 0.95, + "do_sample": True, + }, } - output = predictor.predict(data) - logging.info("Output: " + json.dumps(output)) + + # For TGILLAMACPP, test both with and without the GGUF environment + if args.framework == "TGILLAMACPP": + # First endpoint with default environment (without GGUF) + logging.info( + "Deploying first endpoint with default environment (without GGUF)" + ) + model = HuggingFaceModel( + name=endpoint_name, + env=default_env, + role=args.role, + image_uri=args.image_uri, + ) + deploy_parameters = { + "instance_type": args.instance_type, + "initial_instance_count": 1, + "endpoint_name": endpoint_name, + "container_startup_health_check_timeout": 1800, + } + if args.instance_type.startswith("ml.inf2"): + deploy_parameters["volume_size"] = 256 + + predictor = model.deploy(**deploy_parameters) + logging.info("First endpoint deployment complete.") + + output = predictor.predict(data) + logging.info("First endpoint output: " + json.dumps(output)) + + # Second endpoint with llamacpp environment (with GGUF) + logging.info( + "Deploying second endpoint with llamacpp environment (with GGUF)" + ) + second_endpoint_name = endpoint_name + "-gguf" + second_model = HuggingFaceModel( + name=second_endpoint_name, + env=llamacpp_env, + role=args.role, + image_uri=args.image_uri, + ) + + second_deploy_parameters = deploy_parameters.copy() + second_deploy_parameters["endpoint_name"] = second_endpoint_name + + + second_predictor = second_model.deploy(**second_deploy_parameters) + logging.info("Second endpoint 
deployment complete.") + + second_output = second_predictor.predict(data) + logging.info("Second endpoint output: " + json.dumps(second_output)) + + else: + # For other image types, just deploy a single endpoint + env_to_use = default_env + model = HuggingFaceModel( + name=endpoint_name, + env=env_to_use, + role=args.role, + image_uri=args.image_uri, + ) + deploy_parameters = { + "instance_type": args.instance_type, + "initial_instance_count": 1, + "endpoint_name": endpoint_name, + "container_startup_health_check_timeout": 1800, + } + if args.instance_type.startswith("ml.inf2"): + deploy_parameters["volume_size"] = 256 + + predictor = model.deploy(**deploy_parameters) + logging.info("Endpoint deployment complete.") + + output = predictor.predict(data) + logging.info("Output: " + json.dumps(output)) + # TODO: we need to clearly define the expected output format for each models. # assert "generated_text" in output[0] finally: if predictor: predictor.delete_model() predictor.delete_endpoint() + if second_predictor: + second_predictor.delete_model() + second_predictor.delete_endpoint() + signal.alarm(0) signal.alarm(0) + def get_models_for_image(image_type, device_type): if image_type == "TGI": if device_type == "gpu": return [ - ("bigscience/bloom-560m", None, "ml.g5.12xlarge"), - ("EleutherAI/gpt-neox-20b", None, "ml.g5.12xlarge"), - ("google/flan-t5-xxl", None, "ml.g5.12xlarge"), + ("bigscience/bloom-560m", None, "ml.g5.12xlarge", None), + ("EleutherAI/gpt-neox-20b", None, "ml.g5.12xlarge", None), + ("google/flan-t5-xxl", None, "ml.g5.12xlarge", None), ] elif device_type == "inf2": - return [ ("princeton-nlp/Sheared-LLaMA-1.3B", None, "ml.inf2.xlarge") ] + return [("princeton-nlp/Sheared-LLaMA-1.3B", None, "ml.inf2.xlarge", None)] else: - raise ValueError(f"No testing models found for {image_type} on instance {device_type}. 
" - f"please check whether the image_type and instance_type are supported.") + raise ValueError( + f"No testing models found for {image_type} on instance {device_type}. " + f"please check whether the image_type and instance_type are supported." + ) elif image_type == "TEI": if device_type == "gpu": return [ - ("BAAI/bge-m3", None, "ml.g5.12xlarge"), - ("intfloat/multilingual-e5-base", None, "ml.g5.12xlarge"), - ("thenlper/gte-base", None, "ml.g5.12xlarge"), - ("sentence-transformers/all-MiniLM-L6-v2", None, "ml.g5.12xlarge") + ("BAAI/bge-m3", None, "ml.g5.12xlarge", None), + ("intfloat/multilingual-e5-base", None, "ml.g5.12xlarge", None), + ("thenlper/gte-base", None, "ml.g5.12xlarge", None), + ( + "sentence-transformers/all-MiniLM-L6-v2", + None, + "ml.g5.12xlarge", + None, + ), + ] + elif device_type == "cpu": + return [("BAAI/bge-m3", None, "ml.g5.12xlarge", None)] + else: + raise ValueError( + f"No testing models found for {image_type} on instance {device_type}. " + f"please check whether the image_type and instance_type are supported." + ) + + elif image_type == "TGILLAMACPP": + if device_type == "gpu": + return [ + ( + "Qwen/Qwen2-0.5B-Instruct", + None, + "ml.g5.12xlarge", + "Qwen/Qwen2.5-0.5B-Instruct-GGUF", + ) ] elif device_type == "cpu": - return [("BAAI/bge-m3", None, "ml.g5.12xlarge")] + return [ + ( + "Qwen/Qwen2-0.5B-Instruct", + None, + "ml.m5.12xlarge", + "Qwen/Qwen2.5-0.5B-Instruct-GGUF", + ) + ] else: - raise ValueError(f"No testing models found for {image_type} on instance {device_type}. " - f"please check whether the image_type and instance_type are supported.") + raise ValueError( + f"No testing models found for {image_type} on instance {device_type}. " + f"please check whether the image_type and instance_type are supported." + ) else: - raise ValueError("Invalid image type. Supported types are 'TGI' and 'TEI'.") + raise ValueError( + "Invalid image type. Supported types are 'TGI', 'TEI', and 'TGILLAMACPP'." 
+ ) + def should_run_test_for_image(test_type, target_type): return test_type == target_type -@pytest.mark.parametrize("image_type, device_type", [ - pytest.param("TGI", "gpu", marks=pytest.mark.gpu), - pytest.param("TGI", "inf2", marks=pytest.mark.inf2), - pytest.param("TEI", "gpu", marks=pytest.mark.gpu), - pytest.param("TEI", "cpu", marks=pytest.mark.cpu), -]) + +@pytest.mark.parametrize( + "image_type, device_type", + [ + pytest.param("TGI", "gpu", marks=pytest.mark.gpu), + pytest.param("TGI", "inf2", marks=pytest.mark.inf2), + pytest.param("TEI", "gpu", marks=pytest.mark.gpu), + pytest.param("TEI", "cpu", marks=pytest.mark.cpu), + pytest.param("TGILLAMACPP", "gpu", marks=pytest.mark.gpu), + pytest.param("TGILLAMACPP", "cpu", marks=pytest.mark.cpu), + ], +) def test(image_type, device_type, timeout: str = "3000"): test_target_image_type = os.getenv("TARGET_IMAGE_TYPE") test_device_type = os.getenv("DEVICE_TYPE") - if test_target_image_type and not should_run_test_for_image(image_type, test_target_image_type): - pytest.skip(f"Skipping test for image type {image_type} as it does not match target image type {test_target_image_type}") + if test_target_image_type and not should_run_test_for_image( + image_type, test_target_image_type + ): + pytest.skip( + f"Skipping test for image type {image_type} as it does not match target image type {test_target_image_type}" + ) - if test_device_type and not should_run_test_for_image(device_type, test_device_type): - pytest.skip(f"Skipping test for device type {device_type} as it does not match current device type {test_device_type}") + if test_device_type and not should_run_test_for_image( + device_type, test_device_type + ): + pytest.skip( + f"Skipping test for device type {device_type} as it does not match current device type {test_device_type}" + ) image_uri = os.getenv("IMAGE_URI") test_role_arn = os.getenv("TEST_ROLE_ARN") @@ -125,27 +255,31 @@ def test(image_type, device_type, timeout: str = "3000"): assert test_role_arn, 
f"Please set TEST_ROLE_ARN environment variable." models = get_models_for_image(image_type, device_type) - for model_id, model_revision, instance_type in models: + for model_id, model_revision, instance_type, model_gguf in models: args = argparse.Namespace( image_uri=image_uri, instance_type=instance_type, model_id=model_id, + model_gguf=model_gguf, + framework=image_type, model_revision=model_revision, role=test_role_arn, - timeout=timeout + timeout=timeout, ) logging.info(f"Running sanity test with the following args: {args}.") run_test(args) -if __name__ == '__main__': +if __name__ == "__main__": arg_parser = argparse.ArgumentParser() arg_parser.add_argument("--image_uri", type=str, required=True) arg_parser.add_argument("--instance_type", type=str, required=True) arg_parser.add_argument("--model_id", type=str, required=True) + arg_parser.add_argument("--model_gguf", type=str, required=False) arg_parser.add_argument("--model_revision", type=str, required=False) arg_parser.add_argument("--role", type=str, required=True) arg_parser.add_argument("--timeout", type=str, required=True) + arg_parser.add_argument("--framework", type=str, required=False) args = arg_parser.parse_args() run_test(args)