From 2fd6de93e6416d18cf908e4c03f3aee26bb7222a Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Wed, 13 May 2026 13:50:59 -0700 Subject: [PATCH 1/2] docs(server): update README and versions for r26.05 (#8780) --- Dockerfile.sdk | 2 +- README.md | 11 +++-------- TRITON_VERSION | 2 +- build.py | 4 ++-- deploy/aws/values.yaml | 2 +- deploy/fleetcommand/Chart.yaml | 2 +- deploy/fleetcommand/values.yaml | 6 +++--- deploy/gcp/values.yaml | 2 +- .../perf-analyzer-script/triton_client.yaml | 2 +- .../server-deployer/build_and_push.sh | 4 ++-- .../server-deployer/chart/triton/Chart.yaml | 2 +- .../server-deployer/chart/triton/values.yaml | 6 +++--- .../server-deployer/data-test/schema.yaml | 2 +- .../server-deployer/schema.yaml | 4 ++-- .../gke-marketplace-app/trt-engine/README.md | 6 +++--- deploy/k8s-onprem/values.yaml | 2 +- deploy/oci/values.yaml | 2 +- docs/customization_guide/compose.md | 18 +++++++++--------- docs/getting_started/llm.md | 4 ++-- docs/introduction/release_notes.md | 4 ++-- docs/user_guide/performance_tuning.md | 4 ++-- python/openai/README.md | 6 +++--- qa/common/gen_jetson_trt_models | 2 +- qa/common/gen_qa_model_repository | 2 +- 24 files changed, 48 insertions(+), 53 deletions(-) diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 8febb7bf39..b2181abe6e 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:26.04-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:26.05-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server diff --git a/README.md b/README.md index b2f3d818b0..6fc36283f3 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,6 @@ --> [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) ->[!WARNING] ->You are currently on the `main` branch which tracks 
under-development progress ->towards the next release. The current release is version [2.68.0](https://github.com/triton-inference-server/server/releases/latest) ->and corresponds to the 26.04 container release on NVIDIA GPU Cloud (NGC). - # Triton Inference Server Triton Inference Server is an open source inference serving software that @@ -90,16 +85,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r26.04 https://github.com/triton-inference-server/server.git +git clone -b r26.05 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:26.04-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:26.05-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:26.04-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:26.05-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following Image '/workspace/images/mug.jpg': diff --git a/TRITON_VERSION b/TRITON_VERSION index 5c6f0d3953..a740b92f5e 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.69.0dev +2.69.0 diff --git a/build.py b/build.py index 557f55ae77..e80d62f399 100755 --- a/build.py +++ b/build.py @@ -71,8 +71,8 @@ # DEFAULT_TRITON_VERSION_MAP = { - "release_version": "2.69.0dev", - 
"triton_container_version": "26.05dev", + "release_version": "2.69.0", + "triton_container_version": "26.05", "upstream_container_version": "26.04", "ort_version": "1.24.4", "ort_openvino_version": "2026.1.0", diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index a140611d4f..c94f832aa8 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:26.04-py3 + imageName: nvcr.io/nvidia/tritonserver:26.05-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index e96abde6f5..bd360e7955 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: 2.68.0 +appVersion: 2.69.0 description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index 6ecf3b351d..b911db4afd 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:26.04-py3 + imageName: nvcr.io/nvidia/tritonserver:26.05-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r26.04/docs/user_guide/model_management.md + # See https://github.com/triton-inference-server/server/blob/r26.05/docs/user_guide/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see 
https://github.com/triton-inference-server/server/blob/r26.04/README.md + # see https://github.com/triton-inference-server/server/blob/r26.05/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index c9900d68a0..9784c9d252 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:26.04-py3 + imageName: nvcr.io/nvidia/tritonserver:26.05-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index a732a1da20..0e1347f4fd 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:26.04-py3-sdk + - image: nvcr.io/nvidia/tritonserver:26.05-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index 8e00967f88..4b4468d89d 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -28,8 +28,8 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver export MAJOR_VERSION=2.67 -export MINOR_VERSION=2.68.0 -export NGC_VERSION=26.04-py3 +export MINOR_VERSION=2.69.0 +export NGC_VERSION=26.05-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index 18f83cca68..d150f0e8d7 
100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -28,4 +28,4 @@ apiVersion: v1 appVersion: "2.68" description: Triton Inference Server name: triton-inference-server -version: 2.68.0 +version: 2.69.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 8cfd8171b8..362107e71a 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/26.04 -publishedVersion: '2.68.0' +modelRepositoryPath: gs://triton_sample_models/26.05 +publishedVersion: '2.69.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 26.04-py3 + tag: 26.05-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 7583068bc6..4c312c9880 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.68.0' + publishedVersion: '2.69.0' publishedVersionMetadata: releaseNote: >- Initial release. 
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 457e13d19d..ccf3b157c4 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.68.0' + publishedVersion: '2.69.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. - default: gs://triton_sample_models/26.04 + default: gs://triton_sample_models/26.05 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin, the compiled shared library must be provided via LD_PRELOAD environment variable. 
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index 0200987e6f..fff7466da4 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:26.04-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:26.05-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/26.04/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/26.05/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/26.04/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/26.05/` should be updated accordingly with the correct version. 
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index 4dc4bf2c15..3d788f3f17 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -30,7 +30,7 @@ tags: openshift: false image: - imageName: nvcr.io/nvidia/tritonserver:26.04-py3 + imageName: nvcr.io/nvidia/tritonserver:26.05-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index f8867069c1..df5d60066d 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:26.04-py3 + imageName: nvcr.io/nvidia/tritonserver:26.05-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index e88f0c90ba..e922d27fbe 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r26.04](https://github.com/triton-inference-server/server/tree/r26.04) -should be used to create a image based on the NGC 26.04 Triton release. + [r26.05](https://github.com/triton-inference-server/server/tree/r26.05) +should be used to create an image based on the NGC 26.05 Triton release. `compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. 
@@ -78,20 +78,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r26.04](https://github.com/triton-inference-server/server/tree/r26.04) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:26.04-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:26.04-py3` +on branch [r26.05](https://github.com/triton-inference-server/server/tree/r26.05) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:26.05-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:26.05-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version ` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 26.04 +python3 compose.py --backend pytorch --repoagent checksum --container-version 26.05 ``` 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:26.04-py3-min --image full,nvcr.io/nvidia/tritonserver:26.04-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:26.05-py3-min --image full,nvcr.io/nvidia/tritonserver:26.05-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -102,8 +102,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:26.04-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:26.04-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:26.05-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:26.05-trtllm-python-py3` as a `min` container. 
### CPU-only container composition diff --git a/docs/getting_started/llm.md b/docs/getting_started/llm.md index 052d7829ca..6ea9a5aa33 100644 --- a/docs/getting_started/llm.md +++ b/docs/getting_started/llm.md @@ -282,7 +282,7 @@ The above needs to be done manually with your favorite editor. Once finished, pl -v $(pwd)/all_models:/opt/all_models \ -v $(pwd)/scripts:/opt/scripts \ -v $(pwd)/Phi-3-mini-4k-instruct:/opt/Phi-3-mini-4k-instruct \ - nvcr.io/nvidia/tritonserver:26.04-trtllm-python-py3 + nvcr.io/nvidia/tritonserver:26.05-trtllm-python-py3 # Launch Server python3 ../scripts/launch_triton_server.py --model_repo ../all_models/inflight_batcher_llm --world_size 1 @@ -308,7 +308,7 @@ The above needs to be done manually with your favorite editor. Once finished, pl - export RELEASE="26.04" + export RELEASE="26.05" docker run -it --net=host --gpus '"device=0"' nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk 17. ## Download the Phi-3 tokenizer diff --git a/docs/introduction/release_notes.md b/docs/introduction/release_notes.md index 295734b89f..19fc0f22f3 100644 --- a/docs/introduction/release_notes.md +++ b/docs/introduction/release_notes.md @@ -25,9 +25,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# [Triton Inference Server Release 26.04](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-26-04.html#rel-26-04) +# [Triton Inference Server Release 26.05](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-26-05.html#rel-26-05) -The Triton Inference Server container image, release 26.04, is available +The Triton Inference Server container image, release 26.05, is available on [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver) and is open source on [GitHub](https://github.com/triton-inference-server/server). 
Release notes can diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index d51ad5ab2f..ff837a4629 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -235,7 +235,7 @@ with a `tritonserver` binary. ```bash # Start server container -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:26.04-py3 +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:26.05-py3 # Start serving your models tritonserver --model-repository=/mnt/models @@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u ```bash # Start the SDK container interactively -docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:26.04-py3-sdk +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:26.05-py3-sdk # Benchmark model being served from step 3 perf_analyzer -m densenet_onnx --concurrency-range 1:4 diff --git a/python/openai/README.md b/python/openai/README.md index 4598a5a43f..4134e72cb6 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -46,7 +46,7 @@ docker run -it --net=host --gpus all --rm \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ -e HF_TOKEN \ - nvcr.io/nvidia/tritonserver:26.04-vllm-python-py3 + nvcr.io/nvidia/tritonserver:26.05-vllm-python-py3 ``` 2. Launch the OpenAI-compatible Triton Inference Server: @@ -355,7 +355,7 @@ Currently, OpenAI-Compatible Frontend supports loading embedding models and embe docker run -it --net=host --gpus all --rm \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ -e HF_TOKEN \ - nvcr.io/nvidia/tritonserver:26.04-vllm-python-py3 + nvcr.io/nvidia/tritonserver:26.05-vllm-python-py3 ``` 2. 
Launch the OpenAI-compatible Triton Inference Server: @@ -451,7 +451,7 @@ docker run -it --net=host --gpus all --rm \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ -e HF_TOKEN \ -e TRTLLM_ORCHESTRATOR=1 \ - nvcr.io/nvidia/tritonserver:26.04-trtllm-python-py3 + nvcr.io/nvidia/tritonserver:26.05-trtllm-python-py3 ``` 2. Install dependencies inside the container: diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 7fcdb49823..4d491fa2a1 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=26.04} +TRITON_VERSION=${TRITON_VERSION:=26.05} # Set the CUDA device to use NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index 328f42bbe0..cb1b680800 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -66,7 +66,7 @@ log_message.status "Changing working directory to the script directory to: " "${ cd ${TRITON_MDLS_BASE_SCRIPT_DIR} log_message.status "define: default values" -TRITON_VERSION=${TRITON_VERSION:=26.04} +TRITON_VERSION=${TRITON_VERSION:=26.05} ONNX_VERSION=1.20.1 ONNX_OPSET=0 OPENVINO_VERSION=2024.5.0 From cfd8ec8e35fb4d68303f38206c17fd96879b3f91 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Fri, 15 May 2026 15:13:04 -0400 Subject: [PATCH 2/2] test: Add Torch AOTI Tests (#8771) This change: Creates a new L0_torch_aoti test suite. Adds complex Torch AOTI model generation to qa/common/gen_qa_models.py. Cleans up existing AOTI model generation in qa/common/gen_qa_models.py. Enables torchvision AOTI model generation in qa/common/gen_qa_model_repository. 
--- qa/L0_torch_aoti/test.sh | 148 +++++++ qa/L0_torch_aoti/torch_aoti_infer_test.py | 284 +++++++++++++ qa/common/gen_qa_model_repository | 2 +- qa/common/gen_qa_models.py | 492 +++++++++++++++++----- 4 files changed, 821 insertions(+), 105 deletions(-) create mode 100755 qa/L0_torch_aoti/test.sh create mode 100755 qa/L0_torch_aoti/torch_aoti_infer_test.py diff --git a/qa/L0_torch_aoti/test.sh b/qa/L0_torch_aoti/test.sh new file mode 100755 index 0000000000..f37751c55e --- /dev/null +++ b/qa/L0_torch_aoti/test.sh @@ -0,0 +1,148 @@ +#!/bin/bash +# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +source ../common/util.sh + +if [[ "${DEBUG}" == "true" ]]; then + set -x +else + set +x +fi + +COLOR_DARK="\033[90m" +COLOR_ERROR="\033[31m" +COLOR_INFO="\033[94m" +COLOR_RESET="\033[0m" +COLOR_STATUS="\033[36m" +COLOR_SUCCESS="\033[32m" +COLOR_WARNING="\033[33m" +RET=0 + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [[ "$#" -ge 1 ]]; then + REPO_VERSION=$1 +fi +if [[ -z "$REPO_VERSION" ]]; then + echo -e "${COLOR_ERROR}Repository version must be specified${COLOR_RESET}" 1>&2 + echo -e "${COLOR_ERROR}\n***\n*** Test Failed\n***${COLOR_RESET}" 1>&2 + exit 1 +fi +if [[ ! -z "$TEST_REPO_ARCH" ]]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +MODELDIR=${MODELDIR:=`pwd`/models} +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends + +# PyTorch on SBSA requires libgomp to be loaded first. 
See the following +# GitHub issue for more information: +# https://github.com/pytorch/pytorch/issues/2575 +arch=`uname -m` +echo -e "${COLOR_DARK}Detected architecture: ${arch}${COLOR_RESET}" +if [[ "${arch}" == "aarch64" ]]; then + SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1 + echo -e "${COLOR_DARK}SERVER_LD_PRELOAD=${SERVER_LD_PRELOAD}${COLOR_RESET}" +fi + +# If BACKENDS not specified, default to pytorch +BACKENDS=${BACKENDS:="pytorch"} +export BACKENDS + +# Copy the models into the model repository +echo -e "${COLOR_DARK}Setting up model repository in ${MODELDIR}${COLOR_RESET}" +rm -rf ${MODELDIR} && mkdir -p ${MODELDIR} +models=( + "torch_aoti_complex_index" + "torch_aoti_complex_named" + "torch_aoti_int8_int8" + "torch_aoti_int16_int16" + "torch_aoti_int32_int32" + "torch_aoti_int64_int64" + "torch_aoti_float16_float16" + "torch_aoti_float32_float32" + "torchvision_aoti" +) +for model in "${models[@]}"; do + cp -r ${DATADIR}/qa_model_repository/${model} ${MODELDIR}/${model} + echo -e "${COLOR_DARK}ls ${MODELDIR}/${model}${COLOR_RESET}" + ls -lha ${MODELDIR}/${model} +done +echo -e "${COLOR_DARK}ls ${MODELDIR}${COLOR_RESET}" +ls -lha ${MODELDIR} + +SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1" +SERVER_LOG="./torch_aoti_complex_named-server.log" +CLIENT_LOG="./torch_aoti_complex_named-client.log" + +echo -e "${COLOR_DARK}Running ${SERVER} with model repository ${MODELDIR}${COLOR_RESET}" +run_server +if [[ "${SERVER_PID}" -eq 0 ]]; then + echo -e "${COLOR_ERROR}\n***\n*** Failed to start ${SERVER}\n***${COLOR_RESET}" 1>&2 + cat ${SERVER_LOG} 1>&2 + echo -e "\n" 1>&2 + exit 1 +fi + +# Install torch framework +echo -e "${COLOR_DARK}Installing PyTorch framework required by tests${COLOR_RESET}" +pip install torch + +# Run the Tests +TEST_NAME="torch_aoti_infer_test" +python3 ./${TEST_NAME}.py >> ${CLIENT_LOG} 2>&1 +EXIT_CODE=$? 
+if [[ ${EXIT_CODE} -ne 0 ]]; then + echo -e "${COLOR_ERROR}\n***\n*** Test '${TEST_NAME}' Failed with exit code ${EXIT_CODE}\n***${COLOR_RESET}" 1>&2 + cat ${CLIENT_LOG} 1>&2 + echo -e "\n" 1>&2 + RET=1 +else + echo -e "${COLOR_INFO}\n***\n*** Test '${TEST_NAME}' Passed\n***${COLOR_RESET}" +fi + +# Cleanup +echo -e "${COLOR_DARK}Killing server (pid: ${SERVER_PID})${COLOR_RESET}" +kill -s SIGINT ${SERVER_PID} +wait ${SERVER_PID} || true +echo -e "${COLOR_DARK}Removing model repository${COLOR_RESET}" +for model in "${models[@]}"; do + rm -rf ${MODELDIR}/${model} +done + +# Report results and exit. +if [[ ${RET} -ne 0 ]]; then + echo -e "${COLOR_ERROR}\n***\n*** Test Suite FAILED\n***${COLOR_RESET}" 1>&2 +else + echo -e "${COLOR_SUCCESS}\n***\n*** Test Suite PASSED\n***${COLOR_RESET}" +fi + +exit ${RET} diff --git a/qa/L0_torch_aoti/torch_aoti_infer_test.py b/qa/L0_torch_aoti/torch_aoti_infer_test.py new file mode 100755 index 0000000000..2b93f31a48 --- /dev/null +++ b/qa/L0_torch_aoti/torch_aoti_infer_test.py @@ -0,0 +1,284 @@ +#!/usr/bin/python +# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import test_util as tu +import torch +import tritonclient.http as http + + +class TorchAotiTest(tu.TestResultCollector): + def _get_complex_input_shape(self): + return (1, 16) + + def _get_complex_output_shape(self): + return (1, 16) + + def _get_complex_input_data(self, shape): + return [ + torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(), + torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(), + torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(), + torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(), + ] + + def _get_simple_input_data(self, shape, io_type): + if io_type in [torch.int8, torch.int16, torch.int32, torch.int64]: + return torch.randint(low=0, high=127, size=shape, dtype=io_type).numpy() + elif io_type in [torch.float16, torch.float32, torch.float64]: + return torch.randn(size=shape, dtype=io_type).numpy() + else: + raise ValueError(f"Unsupported data type: {io_type}") + + def _get_torchvision_input_data(self, shape): + return torch.randn(size=shape, dtype=torch.float32).numpy() + + def _dtype_to_triton_dtype(self, dtype): + if dtype == torch.int8: + 
return "INT8" + elif dtype == torch.int16: + return "INT16" + elif dtype == torch.int32: + return "INT32" + elif dtype == torch.int64: + return "INT64" + elif dtype == torch.float16: + return "FP16" + elif dtype == torch.float32: + return "FP32" + else: + raise ValueError(f"Unsupported data type: {dtype}") + + def _get_simple_model_name(self, io_type): + if io_type == torch.int8: + return "torch_aoti_int8_int8" + elif io_type == torch.int16: + return "torch_aoti_int16_int16" + elif io_type == torch.int32: + return "torch_aoti_int32_int32" + elif io_type == torch.int64: + return "torch_aoti_int64_int64" + elif io_type == torch.float16: + return "torch_aoti_float16_float16" + elif io_type == torch.float32: + return "torch_aoti_float32_float32" + else: + raise ValueError(f"Unsupported data type: {io_type}") + + def test_complex_index(self): + MODEL_NAME = "torch_aoti_complex_index" + INPUT_SHAPE = self._get_complex_input_shape() + OUTPUT_SHAPE = self._get_complex_output_shape() + + input_data = self._get_complex_input_data(INPUT_SHAPE) + + with http.InferenceServerClient("localhost:8000") as client: + inputs = [ + http.InferInput("INPUT__0", input_data[0].shape, "INT8"), + http.InferInput("INPUT__1", input_data[1].shape, "INT8"), + http.InferInput("INPUT__2", input_data[2].shape, "INT8"), + http.InferInput("INPUT__3", input_data[3].shape, "INT8"), + ] + + inputs[0].set_data_from_numpy(input_data[0], binary_data=True) + inputs[1].set_data_from_numpy(input_data[1], binary_data=True) + inputs[2].set_data_from_numpy(input_data[2], binary_data=True) + inputs[3].set_data_from_numpy(input_data[3], binary_data=True) + + output_names = [ + "OUTPUT__0", + "OUTPUT__1", + "OUTPUT__2", + "OUTPUT__3", + "OUTPUT__4", + "OUTPUT__5", + ] + + outputs = [] + for output_name in output_names: + outputs.append(http.InferRequestedOutput(output_name, binary_data=True)) + + output_data = [] + results = client.infer(MODEL_NAME, inputs, outputs=outputs) + + for output_name in output_names: + 
output_data.append(results.as_numpy(output_name)) + + self.assertEqual(len(outputs), len(output_data)) + for data in output_data: + self.assertEqual(data.shape, OUTPUT_SHAPE) + + self.assertTrue((output_data[0] == (input_data[0] + input_data[1])).all()) + self.assertTrue((output_data[1] == input_data[0] - input_data[1]).all()) + self.assertTrue((output_data[2] == input_data[0]).all()) + self.assertTrue((output_data[3] == input_data[1]).all()) + self.assertTrue((output_data[4] == input_data[2]).all()) + self.assertTrue((output_data[5] == input_data[3]).all()) + + def test_complex_named(self): + MODEL_NAME = "torch_aoti_complex_named" + INPUT_SHAPE = self._get_complex_input_shape() + OUTPUT_SHAPE = self._get_complex_output_shape() + + input_data = self._get_complex_input_data(INPUT_SHAPE) + + with http.InferenceServerClient("localhost:8000") as client: + inputs = [ + http.InferInput("ARGS[0]", input_data[0].shape, "INT8"), + http.InferInput("ARGS[1]", input_data[1].shape, "INT8"), + http.InferInput("ARGS[2][option1]", input_data[2].shape, "INT8"), + http.InferInput("ARGS[2][option2]", input_data[3].shape, "INT8"), + ] + + inputs[0].set_data_from_numpy(input_data[0], binary_data=True) + inputs[1].set_data_from_numpy(input_data[1], binary_data=True) + inputs[2].set_data_from_numpy(input_data[2], binary_data=True) + inputs[3].set_data_from_numpy(input_data[3], binary_data=True) + + output_names = [ + "RESULT[AAA]", + "RESULT[BBB][0]", + "RESULT[BBB][1]", + "RESULT[CCC][option1]", + "RESULT[CCC][option2]", + "RESULT[ZZZ]", + ] + + outputs = [] + for output_name in output_names: + outputs.append(http.InferRequestedOutput(output_name, binary_data=True)) + + output_data = [] + results = client.infer(MODEL_NAME, inputs, outputs=outputs) + + for output_name in output_names: + output_data.append(results.as_numpy(output_name)) + + self.assertEqual(len(outputs), len(output_data)) + for data in output_data: + self.assertEqual(data.shape, OUTPUT_SHAPE) + + 
self.assertTrue((output_data[0] == (input_data[0] + input_data[1])).all()) + self.assertTrue((output_data[1] == input_data[0]).all()) + self.assertTrue((output_data[2] == input_data[1]).all()) + self.assertTrue((output_data[3] == input_data[2]).all()) + self.assertTrue((output_data[4] == input_data[3]).all()) + self.assertTrue((output_data[5] == (input_data[0] - input_data[1])).all()) + + def test_simple_model(self): + io_types = [ + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.float16, + torch.float32, + ] + for io_type in io_types: + MODEL_NAME = self._get_simple_model_name(io_type) + INPUT_SHAPE = (16,) + OUTPUT_SHAPE = (16,) + TRITON_IO_TYPE = self._dtype_to_triton_dtype(io_type) + + input_data = ( + self._get_simple_input_data(INPUT_SHAPE, io_type), + self._get_simple_input_data(INPUT_SHAPE, io_type), + ) + + with http.InferenceServerClient("localhost:8000") as client: + inputs = [ + http.InferInput("ARGS[0]", input_data[0].shape, TRITON_IO_TYPE), + http.InferInput("ARGS[1]", input_data[1].shape, TRITON_IO_TYPE), + ] + + inputs[0].set_data_from_numpy(input_data[0], binary_data=True) + inputs[1].set_data_from_numpy(input_data[1], binary_data=True) + + output_names = [ + "RESULT", + ] + + outputs = [] + for output_name in output_names: + outputs.append( + http.InferRequestedOutput(output_name, binary_data=True) + ) + + output_data = [] + results = client.infer(MODEL_NAME, inputs, outputs=outputs) + + for output_name in output_names: + output_data.append(results.as_numpy(output_name)) + + self.assertEqual(len(outputs), len(output_data)) + for data in output_data: + self.assertEqual(data.shape, OUTPUT_SHAPE) + self.assertTrue((data == input_data[0] + input_data[1]).all()) + + def test_torchvision(self): + MODEL_NAME = "torchvision_aoti" + INPUT_SHAPE = (1, 3, 224, 224) + OUTPUT_SHAPE = (1, 1000) + + input_data = self._get_torchvision_input_data(INPUT_SHAPE) + input_data[0][0] = 1.0 + + with http.InferenceServerClient("localhost:8000") as 
client: + inputs = [ + http.InferInput("ARGS[0]", input_data.shape, "FP32"), + ] + + inputs[0].set_data_from_numpy(input_data, binary_data=True) + + output_names = [ + "RESULT", + ] + + outputs = [] + for output_name in output_names: + outputs.append(http.InferRequestedOutput(output_name, binary_data=True)) + + output_data = [] + results = client.infer(MODEL_NAME, inputs, outputs=outputs) + + for output_name in output_names: + output_data.append(results.as_numpy(output_name)) + + self.assertEqual(len(outputs), len(output_data)) + for data in output_data: + self.assertEqual(data.shape, OUTPUT_SHAPE) + output_tensor = torch.from_numpy(data) + self.assertTrue(torch.isfinite(output_tensor).all().item()) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index cb1b680800..a503b64a10 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -263,9 +263,9 @@ set -e PATH=$PATH:/usr/local/cuda-13.0/bin python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --models_dir=$TRITON_MDLS_QA_MODEL python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --models_dir=$TRITON_MDLS_QA_MODEL +python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torchvision-aoti --models_dir=$TRITON_MDLS_QA_MODEL chmod -R 777 $TRITON_MDLS_QA_MODEL python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL -python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL chmod -R 777 $TRITON_MDLS_QA_VARIABLE_MODEL python3 $TRITON_MDLS_SRC_DIR/gen_qa_identity_models.py --libtorch --models_dir=$TRITON_MDLS_QA_IDENTITY_MODEL chmod -R 777 $TRITON_MDLS_QA_IDENTITY_MODEL diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py index d509562bff..a015dc0e2d 100755 --- a/qa/common/gen_qa_models.py +++ b/qa/common/gen_qa_models.py @@ -47,6 +47,7 @@ from typing import List, Tuple _color_blue 
= "\033[94m" +_color_cyan = "\033[36m" _color_green = "\033[32m" _color_magenta = "\033[35m" _color_red = "\033[31m" @@ -1289,7 +1290,7 @@ def forward(self, INPUT0, INPUT1): traced.save(f"{model_version_dir}/model.pt") -def generate_sample_inputs( +def generate_torch_aoti_sample_inputs( input_shape, input_dtype, device, @@ -1297,70 +1298,32 @@ def generate_sample_inputs( # handle for -1 (when variable) since can't create tensor with shape of [-1] input_shape = [abs(ips) for ips in input_shape] - if input_dtype == np.int8: - input0 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device) - input1 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device) - elif input_dtype == np.int16: - input0 = torch.randint( - -32768, 32767, input_shape, dtype=torch.int16, device=device - ) - input1 = torch.randint( - -32768, 32767, input_shape, dtype=torch.int16, device=device - ) - elif input_dtype == np.int32: - input0 = torch.randint( - -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device - ) - input1 = torch.randint( - -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device - ) - elif input_dtype == np.int64: - input0 = torch.randint( - -9223372036854775808, - 9223372036854775807, - input_shape, - dtype=torch.int64, - device=device, - ) - input1 = torch.randint( - -9223372036854775808, - 9223372036854775807, - input_shape, - dtype=torch.int64, - device=device, - ) - elif input_dtype == np.float16: - input0 = torch.randn(*input_shape, dtype=torch.float16, device=device) - input1 = torch.randn(*input_shape, dtype=torch.float16, device=device) - elif input_dtype == np.float32: - input0 = torch.randn(*input_shape, dtype=torch.float32, device=device) - input1 = torch.randn(*input_shape, dtype=torch.float32, device=device) - elif input_dtype == np.float64: - input0 = torch.randn(*input_shape, dtype=torch.float64, device=device) - input1 = torch.randn(*input_shape, dtype=torch.float64, device=device) - elif input_dtype 
== np.uint8: - input0 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device) - input1 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device) - elif input_dtype == np.uint16: - input0 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device) - input1 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device) - elif input_dtype == np.uint32: - input0 = torch.randint( - 0, 4294967295, input_shape, dtype=torch.uint32, device=device - ) - input1 = torch.randint( - 0, 4294967295, input_shape, dtype=torch.uint32, device=device - ) - elif input_dtype == np.uint64: - input0 = torch.randint( - 0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device - ) - input1 = torch.randint( - 0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device + np_to_torch_dtype = { + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.uint8: torch.uint8, + np.uint16: torch.uint16, + np.uint32: torch.uint32, + np.uint64: torch.uint64, + } + + if input_dtype not in np_to_torch_dtype: + print( + f"{_color_yellow}warning: dtype {input_dtype} is unsupported; falling back to torch.int32{_color_reset}" ) - else: - input0 = torch.randn(*input_shape, device=device) - input1 = torch.randn(*input_shape, device=device) + input_dtype = np.int32 + + input0 = torch.zeros( + input_shape, dtype=np_to_torch_dtype[input_dtype], device=device + ) + input1 = torch.zeros( + input_shape, dtype=np_to_torch_dtype[input_dtype], device=device + ) return (input0, input1) @@ -1395,7 +1358,7 @@ def np_to_dtype(np_dtype): return torch.int32 -def create_torch_aoti_modelfile( +def create_torch_aoti_model_file( models_dir, model_version, input_shape, @@ -1418,7 +1381,7 @@ def create_torch_aoti_modelfile( ) return False - model_version_dir = f"{models_dir}/{model_name}/{model_version}" + model_version_dir 
= os.path.join(models_dir, model_name, str(model_version)) print(f"{_color_green}Creating model {model_name}{_color_reset}") @@ -1465,13 +1428,14 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor: model.to(device) model = model.eval() - sample_input = generate_sample_inputs(input_shape, input_dtype, device) + sample_inputs = generate_torch_aoti_sample_inputs(input_shape, input_dtype, device) + package_path = os.path.join(model_version_dir, "model.pt2") try: - ep = torch.export.export(model, sample_input) + exported_model = torch.export.export(model, sample_inputs) torch._inductor.aoti_compile_and_package( - ep, - package_path=f"{model_version_dir}/model.pt2", + exported_model, + package_path=package_path, ) except Exception as e: print( @@ -1484,13 +1448,162 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor: return True -def create_torchvision_aoti_modelfile( +def create_torch_aoti_complex_model_file( + models_dir: str, +): + base_name = "torch_aoti_complex" + model_names = [ + f"{base_name}_named", + f"{base_name}_index", + ] + model_version_dirs = [ + os.path.join(models_dir, model_names[0], "1"), + os.path.join(models_dir, model_names[1], "1"), + ] + + for model_version_dir in model_version_dirs: + try: + os.makedirs(model_version_dir) + except OSError: + pass # ignore existing dir + + print(f"{_color_green}Creating model {base_name}{_color_reset}") + + class TorchAotiComplex(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward( + self, + hdata: torch.Tensor, + vdata: torch.Tensor, + options: dict[str, torch.Tensor], + ) -> dict[ + str, + torch.Tensor | tuple[torch.Tensor, torch.Tensor] | dict[str, torch.Tensor], + ]: + out = { + "AAA": hdata + vdata, + "ZZZ": hdata - vdata, + "BBB": ( + hdata, + vdata, + ), + "CCC": options, + } + + return out + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = TorchAotiComplex() + model.to(device) + model = 
model.eval() + + SHAPE = (1, 16) + + sample_args = ( + torch.zeros(SHAPE, dtype=torch.int8, device=device), + torch.zeros(SHAPE, dtype=torch.int8, device=device), + { + "option1": torch.zeros(SHAPE, dtype=torch.int8, device=device), + "option2": torch.zeros(SHAPE, dtype=torch.int8, device=device), + }, + ) + + # Export and package the model + print(f"{_color_green}Exporting and packaging the model...{_color_reset}") + + model_file_name = "model.pt2" + package_paths = [ + os.path.join(model_version_dirs[0], model_file_name), + os.path.join(model_version_dirs[1], model_file_name), + ] + + try: + exported_model = torch.export.export(model, sample_args) + torch._inductor.aoti_compile_and_package( + exported_model, + package_path=package_paths[0], + ) + except Exception as e: + print( + f"{_color_red}error: Failed to create model {base_name}{_color_reset}", + file=sys.stderr, + ) + print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr) + return False + + try: + # Now load and run the packaged model + print(f"{_color_cyan}Loading and running the packaged model...{_color_reset}") + + compiled_model = torch._inductor.aoti_load_package(package_paths[0]) + + print(f"{_color_cyan}Compiled model call spec:{_color_reset}") + + for elem in compiled_model.loader.get_call_spec(): + print(elem) + + print(f"{_color_cyan}Running the compiled model...{_color_reset}") + + with torch.inference_mode(): + hdata = torch.randint( + low=0, + high=127, + size=SHAPE, + dtype=torch.int8, + device=device, + ) + vdata = torch.randint( + low=0, + high=127, + size=SHAPE, + dtype=torch.int8, + device=device, + ) + options = { + "option1": torch.randint( + low=0, + high=127, + size=SHAPE, + dtype=torch.int8, + device=device, + ), + "option2": torch.randint( + low=0, + high=127, + size=SHAPE, + dtype=torch.int8, + device=device, + ), + } + + _ = compiled_model(hdata, vdata, options) + + print( + f'{_color_green}Model "{base_name}" successfully executed.{_color_reset}' + ) + except Exception as 
e: + print( + f"{_color_red}error: Failed to validate model {base_name}{_color_reset}", + file=sys.stderr, + ) + print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr) + return False + + # Copy the compiled model package to the alternate model folder. + # Both the named and ordinal addressing versions of the model (from Triton's point-of-view) use the same compiled model. + shutil.copy(package_paths[0], package_paths[1]) + + return True + + +def create_torchvision_aoti_model_file( models_dir: str, max_batch: int, - model_version: int, ): model_name = "torchvision_aoti" - model_version_dir = f"{models_dir}/{model_name}/{model_version}" + model_version_dir = os.path.join(models_dir, model_name, "1") try: os.makedirs(model_version_dir) @@ -1504,16 +1617,16 @@ def create_torchvision_aoti_modelfile( model = model.to(device) model = model.eval() + SHAPE = (max_batch, 3, 224, 224) + # Example input tensor with batch size 1 and 3 color channels (RGB), height and width of 224 - input_tensor = torch.randn(max_batch, 3, 224, 224, device=device) + sample_inputs = (torch.zeros(SHAPE, dtype=torch.float32, device=device),) - try: - ep = torch.export.export(model, (input_tensor,)) + package_path = os.path.join(model_version_dir, "model.pt2") - torch._inductor.aoti_compile_and_package( - ep, - package_path=f"{model_version_dir}/model.pt2", - ) + try: + ep = torch.export.export(model, sample_inputs) + torch._inductor.aoti_compile_and_package(ep, package_path=package_path) except Exception as e: print( f"{_color_red}error: Failed to create model {model_name}{_color_reset}", @@ -1609,9 +1722,11 @@ def create_libtorch_modelconfig( except OSError: pass # ignore existing dir - with open(f"{config_dir}/config.pbtxt", "w") as file: + config_path = os.path.join(config_dir, "config.pbtxt") + + with open(config_path, "w") as file: file.write(config) - print(f"Created {config_dir}/config.pbtxt") + print(f"Created {config_path}") with open(f"{config_dir}/{label_filename}", "w") as file: 
for l in range(output0_label_cnt): @@ -1619,7 +1734,7 @@ def create_libtorch_modelconfig( print(f"Created {config_dir}/{label_filename}") -def create_torch_aoti_modelconfig( +def create_torch_aoti_model_config( models_dir, input_shape, output_shape, @@ -1650,7 +1765,7 @@ def create_torch_aoti_modelconfig( print(f"{_color_green}Creating config for {model_name}{_color_reset}") label_filename = "output_labels.txt" - config_dir = f"{models_dir}/{model_name}" + config_dir = os.path.join(models_dir, model_name) config = f""" backend: "pytorch" name: "{model_name}" @@ -1658,19 +1773,19 @@ def create_torch_aoti_modelconfig( version_policy: {version_policy_str} input [ {{ - name: "INPUT0" + name: "ARGS[0]" data_type: {np_to_model_dtype(input_dtype)} dims: [ {tu.shape_to_dims_str(input_shape)} ] }}, {{ - name: "INPUT1" + name: "ARGS[1]" data_type: {np_to_model_dtype(input_dtype)} dims: [ {tu.shape_to_dims_str(input_shape)} ] }} ] output [ {{ - name: "OUTPUT__0" + name: "RESULT" data_type: {np_to_model_dtype(output_dtype)} dims: [ {tu.shape_to_dims_str(output_shape)} ] label_filename: "{label_filename}" @@ -1684,17 +1799,173 @@ def create_torch_aoti_modelconfig( except OSError: pass # ignore existing dir - with open(f"{config_dir}/config.pbtxt", "w") as file: + config_path = os.path.join(config_dir, "config.pbtxt") + + with open(config_path, "w") as file: file.write(config) - print(f"Created {config_dir}/config.pbtxt") + print(f"Created {config_path}") - with open(f"{config_dir}/{label_filename}", "w") as file: + label_path = os.path.join(config_dir, label_filename) + + with open(label_path, "w") as file: for l in range(output_label_cnt): file.write(f"label{l}\n") - print(f"Created {config_dir}/{label_filename}") + print(f"Created {label_path}") + + +def create_torch_aoti_complex_model_config( + models_dir, +): + base_name = "torch_aoti_complex" + model_names = [ + f"{base_name}_named", + f"{base_name}_index", + ] + + print(f"{_color_green}Creating config for 
{base_name}{_color_reset}") + + config_dirs = [ + os.path.join(models_dir, model_names[0]), + os.path.join(models_dir, model_names[1]), + ] + configs = [ + f""" +backend: "pytorch" +platform: "torch_aoti" +name: "{model_names[0]}" +input: [ + {{ + name: "ARGS[0]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "ARGS[1]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "ARGS[2][option1]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "ARGS[2][option2]" + data_type: TYPE_INT8 + dims: [1, 16] + }} +] +output: [ + {{ + name: "RESULT[AAA]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "RESULT[BBB][0]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "RESULT[BBB][1]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "RESULT[CCC][option1]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "RESULT[CCC][option2]" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "RESULT[ZZZ]" + data_type: TYPE_INT8 + dims: [1, 16] + }} +] +instance_group [{{ kind: {"KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"} }}] +""", + f""" +backend: "pytorch" +name: "{model_names[1]}" +platform: "torch_aoti" +input: [ + {{ + name: "INPUT__0" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "INPUT__1" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "INPUT__2" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "INPUT__3" + data_type: TYPE_INT8 + dims: [1, 16] + }} +] +output: [ + {{ + name: "OUTPUT__0" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "OUTPUT__1" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "OUTPUT__2" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "OUTPUT__3" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "OUTPUT__4" + data_type: TYPE_INT8 + dims: [1, 16] + }}, + {{ + name: "OUTPUT__5" + data_type: TYPE_INT8 + dims: [1, 16] + }} +] +instance_group [{{ kind: {"KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"} }}] +""", + ] + + for 
i in range(2): + config_dir = config_dirs[i] + try: + os.makedirs(config_dir) + except OSError: + pass # ignore existing dir + config_path = os.path.join(config_dir, "config.pbtxt") -def create_torchvision_aoti_modelconfig( + with open(config_path, "w") as file: + file.write(configs[i]) + print(f"Created {config_path}") + + +def create_torchvision_aoti_model_config( models_dir: str, max_batch: int, ): @@ -1703,7 +1974,7 @@ def create_torchvision_aoti_modelconfig( print(f"{_color_green}Creating config for {model_name}{_color_reset}") - config_dir = f"{models_dir}/{model_name}" + config_dir = os.path.join(models_dir, model_name) config = f""" backend: "pytorch" name: "{model_name}" @@ -1711,14 +1982,13 @@ def create_torchvision_aoti_modelconfig( max_batch_size: {max_batch} input [ {{ - name: "INPUT__0" + name: "ARGS[0]" data_type: TYPE_FP32 - format: FORMAT_NCHW dims: [ 3, 224, 224 ] }}] output [ {{ - name: "OUTPUT__0" + name: "RESULT" data_type: TYPE_FP32 dims: [ 1000 ] label_filename: "{label_filename}" @@ -1732,15 +2002,19 @@ def create_torchvision_aoti_modelconfig( except OSError: pass # ignore existing dir - with open(f"{config_dir}/config.pbtxt", "w") as file: + config_path = os.path.join(config_dir, "config.pbtxt") + + with open(config_path, "w") as file: file.write(config) - print(f"Created {config_dir}/config.pbtxt") + print(f"Created {config_path}") source_path = os.environ.get("TRITON_GENSRCDIR", default="gen_srcdir") source_filename = os.path.join(source_path, RESNET50_LABEL_FILE) - shutil.copyfile(source_filename, f"{config_dir}/{label_filename}") - print(f"Created {config_dir}/{label_filename}") + target_path = os.path.join(config_dir, label_filename) + + shutil.copyfile(source_filename, target_path) + print(f"Created {target_path}") def create_openvino_modelfile( @@ -2106,14 +2380,14 @@ def create_models( f"{_color_magenta}PyTorch: AOTI model generation requested{_color_reset}" ) # max-batch 8 - if create_torch_aoti_modelfile( + if 
create_torch_aoti_model_file( models_dir, model_version, input_shape, input_dtype, output0_dtype, ): - create_torch_aoti_modelconfig( + create_torch_aoti_model_config( models_dir, input_shape, output0_shape, @@ -2352,6 +2626,8 @@ def create_fixed_models( if FLAGS.onnx: import onnx if FLAGS.libtorch or FLAGS.torch_aoti: + import shutil + import torch from torch import nn if FLAGS.torchvision_aoti: @@ -2747,7 +3023,15 @@ def create_fixed_models( for model_shape in [(-1,), (-1, -1), (-1, -1, -1)]: emu.create_nop_modelconfig(FLAGS.models_dir, model_shape, model_dtype) + if FLAGS.torch_aoti: + print( + f"{_color_magenta}PyTorch: Complex AOTI model generation requested{_color_reset}" + ) + if create_torch_aoti_complex_model_file(FLAGS.models_dir): + create_torch_aoti_complex_model_config(FLAGS.models_dir) + if FLAGS.torchvision_aoti: + # TODO: Add support for variable batch size and version policy for torchvision AOTI models. print(f"{_color_blue}TorchVision AOTI model generation requested{_color_reset}") - if create_torchvision_aoti_modelfile(FLAGS.models_dir, 1, 1): - create_torchvision_aoti_modelconfig(FLAGS.models_dir, 1) + if create_torchvision_aoti_model_file(FLAGS.models_dir, 1): + create_torchvision_aoti_model_config(FLAGS.models_dir, 1)