From 2fd6de93e6416d18cf908e4c03f3aee26bb7222a Mon Sep 17 00:00:00 2001
From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>
Date: Wed, 13 May 2026 13:50:59 -0700
Subject: [PATCH 1/2] docs(server): update README and versions for r26.05
 (#8780)

---
 Dockerfile.sdk                                 |  2 +-
 README.md                                      | 11 +++--------
 TRITON_VERSION                                 |  2 +-
 build.py                                       |  4 ++--
 deploy/aws/values.yaml                         |  2 +-
 deploy/fleetcommand/Chart.yaml                 |  2 +-
 deploy/fleetcommand/values.yaml                |  6 +++---
 deploy/gcp/values.yaml                         |  2 +-
 .../perf-analyzer-script/triton_client.yaml    |  2 +-
 .../server-deployer/build_and_push.sh          |  4 ++--
 .../server-deployer/chart/triton/Chart.yaml    |  2 +-
 .../server-deployer/chart/triton/values.yaml   |  6 +++---
 .../server-deployer/data-test/schema.yaml      |  2 +-
 .../server-deployer/schema.yaml                |  4 ++--
 .../gke-marketplace-app/trt-engine/README.md   |  6 +++---
 deploy/k8s-onprem/values.yaml                  |  2 +-
 deploy/oci/values.yaml                         |  2 +-
 docs/customization_guide/compose.md            | 18 +++++++++---------
 docs/getting_started/llm.md                    |  4 ++--
 docs/introduction/release_notes.md             |  4 ++--
 docs/user_guide/performance_tuning.md          |  4 ++--
 python/openai/README.md                        |  6 +++---
 qa/common/gen_jetson_trt_models                |  2 +-
 qa/common/gen_qa_model_repository              |  2 +-
 24 files changed, 48 insertions(+), 53 deletions(-)

diff --git a/Dockerfile.sdk b/Dockerfile.sdk
index 8febb7bf39..b2181abe6e 100644
--- a/Dockerfile.sdk
+++ b/Dockerfile.sdk
@@ -29,7 +29,7 @@
 #
 
 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:26.04-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:26.05-py3-min
 
 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
 ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server
diff --git a/README.md b/README.md
index b2f3d818b0..6fc36283f3 100644
--- a/README.md
+++ b/README.md
@@ -27,11 +27,6 @@
 -->
 [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
 
->[!WARNING]
->You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.68.0](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 26.04 container release on NVIDIA GPU Cloud (NGC).
-
 # Triton Inference Server
 
 Triton Inference Server is an open source inference serving software that
@@ -90,16 +85,16 @@ Inference Server with the
 
 ```bash
 # Step 1: Create the example model repository
-git clone -b r26.04 https://github.com/triton-inference-server/server.git
+git clone -b r26.05 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh
 
 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:26.04-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:26.05-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
 
 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:26.04-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:26.05-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
 
 # Inference should return the following
 Image '/workspace/images/mug.jpg':
diff --git a/TRITON_VERSION b/TRITON_VERSION
index 5c6f0d3953..a740b92f5e 100644
--- a/TRITON_VERSION
+++ b/TRITON_VERSION
@@ -1 +1 @@
-2.69.0dev
+2.69.0
diff --git a/build.py b/build.py
index 557f55ae77..e80d62f399 100755
--- a/build.py
+++ b/build.py
@@ -71,8 +71,8 @@
 #
 
 DEFAULT_TRITON_VERSION_MAP = {
-    "release_version": "2.69.0dev",
-    "triton_container_version": "26.05dev",
+    "release_version": "2.69.0",
+    "triton_container_version": "26.05",
     "upstream_container_version": "26.04",
     "ort_version": "1.24.4",
     "ort_openvino_version": "2026.1.0",
diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml
index a140611d4f..c94f832aa8 100644
--- a/deploy/aws/values.yaml
+++ b/deploy/aws/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:26.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:26.05-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://triton-inference-server-repository/model_repository
   numGpus: 1
diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml
index e96abde6f5..bd360e7955 100644
--- a/deploy/fleetcommand/Chart.yaml
+++ b/deploy/fleetcommand/Chart.yaml
@@ -26,7 +26,7 @@
 
 apiVersion: v1
 # appVersion is the Triton version; update when changing release
-appVersion: 2.68.0
+appVersion: 2.69.0
 description: Triton Inference Server (Fleet Command)
 name: triton-inference-server
 # version is the Chart version; update when changing anything in the chart
diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml
index 6ecf3b351d..b911db4afd 100644
--- a/deploy/fleetcommand/values.yaml
+++ b/deploy/fleetcommand/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:26.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:26.05-py3
   pullPolicy: IfNotPresent
   numGpus: 1
   serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
     #
     # To set model control mode, uncomment and configure below
     # TODO: Fix the following url, it is invalid
-    # See https://github.com/triton-inference-server/server/blob/r26.04/docs/user_guide/model_management.md
+    # See https://github.com/triton-inference-server/server/blob/r26.05/docs/user_guide/model_management.md
     #  for more details
     #- --model-control-mode=explicit|poll|none
     #
     # Additional server args
     #
-    # see https://github.com/triton-inference-server/server/blob/r26.04/README.md
+    # see https://github.com/triton-inference-server/server/blob/r26.05/README.md
     #  for more details
 
 service:
diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml
index c9900d68a0..9784c9d252 100644
--- a/deploy/gcp/values.yaml
+++ b/deploy/gcp/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:26.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:26.05-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: gs://triton-inference-server-repository/model_repository
   numGpus: 1
diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
index a732a1da20..0e1347f4fd 100644
--- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
+++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
@@ -33,7 +33,7 @@ metadata:
   namespace: default
 spec:
   containers:
-  - image: nvcr.io/nvidia/tritonserver:26.04-py3-sdk
+  - image: nvcr.io/nvidia/tritonserver:26.05-py3-sdk
     imagePullPolicy: Always
     name: nv-triton-client
     securityContext:
diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
index 8e00967f88..4b4468d89d 100755
--- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
+++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -28,8 +28,8 @@
 export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
 export APP_NAME=tritonserver
 export MAJOR_VERSION=2.67
-export MINOR_VERSION=2.68.0
-export NGC_VERSION=26.04-py3
+export MINOR_VERSION=2.69.0
+export NGC_VERSION=26.05-py3
 
 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION
 
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
index 18f83cca68..d150f0e8d7 100644
--- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
@@ -28,4 +28,4 @@ apiVersion: v1
 appVersion: "2.68"
 description: Triton Inference Server
 name: triton-inference-server
-version: 2.68.0
+version: 2.69.0
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
index 8cfd8171b8..362107e71a 100644
--- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
@@ -31,14 +31,14 @@ maxReplicaCount: 3
 tritonProtocol: HTTP
 # HPA GPU utilization autoscaling target
 HPATargetAverageValue: 85
-modelRepositoryPath: gs://triton_sample_models/26.04
-publishedVersion: '2.68.0'
+modelRepositoryPath: gs://triton_sample_models/26.05
+publishedVersion: '2.69.0'
 gcpMarketplace: true
 
 image:
   registry: gcr.io
   repository: nvidia-ngc-public/tritonserver
-  tag: 26.04-py3
+  tag: 26.05-py3
   pullPolicy: IfNotPresent
   # modify the model repository here to match your GCP storage bucket
   numGpus: 1
diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
index 7583068bc6..4c312c9880 100644
--- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
@@ -27,7 +27,7 @@
 x-google-marketplace:
   schemaVersion: v2
   applicationApiVersion: v1beta1
-  publishedVersion: '2.68.0'
+  publishedVersion: '2.69.0'
   publishedVersionMetadata:
     releaseNote: >-
       Initial release.
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml
index 457e13d19d..ccf3b157c4 100644
--- a/deploy/gke-marketplace-app/server-deployer/schema.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -27,7 +27,7 @@
 x-google-marketplace:
   schemaVersion: v2
   applicationApiVersion: v1beta1
-  publishedVersion: '2.68.0'
+  publishedVersion: '2.69.0'
   publishedVersionMetadata:
     releaseNote: >-
       Initial release.
@@ -89,7 +89,7 @@ properties:
   modelRepositoryPath:
     type: string
     title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc.
-    default: gs://triton_sample_models/26.04
+    default: gs://triton_sample_models/26.05
   image.ldPreloadPath:
     type: string
     title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin, the compiled shared library must be provided via LD_PRELOAD environment variable.
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md
index 0200987e6f..fff7466da4 100644
--- a/deploy/gke-marketplace-app/trt-engine/README.md
+++ b/deploy/gke-marketplace-app/trt-engine/README.md
@@ -33,7 +33,7 @@
 ```
 docker run --gpus all -it --network host \
     --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
-    -v ~:/scripts nvcr.io/nvidia/tensorrt:26.04-py3
+    -v ~:/scripts nvcr.io/nvidia/tensorrt:26.05-py3
 
 pip install onnx six torch tf2onnx tensorflow
 
@@ -57,7 +57,7 @@ mkdir -p engines
 
 python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh
 
-gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/26.04/bert/1/model.plan
+gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/26.05/bert/1/model.plan
 ```
 
-For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/26.04/` should be updated accordingly with the correct version.
+For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/26.05/` should be updated accordingly with the correct version.
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml
index 4dc4bf2c15..3d788f3f17 100644
--- a/deploy/k8s-onprem/values.yaml
+++ b/deploy/k8s-onprem/values.yaml
@@ -30,7 +30,7 @@ tags:
   openshift: false
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:26.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:26.05-py3
   pullPolicy: IfNotPresent
   modelRepositoryServer: < Replace with the IP Address of your file server >
   modelRepositoryPath: /srv/models
diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml
index f8867069c1..df5d60066d 100644
--- a/deploy/oci/values.yaml
+++ b/deploy/oci/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:26.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:26.05-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository
   numGpus: 1
diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md
index e88f0c90ba..e922d27fbe 100644
--- a/docs/customization_guide/compose.md
+++ b/docs/customization_guide/compose.md
@@ -46,8 +46,8 @@ The `compose.py` script can be found in the
 Simply clone the repository and run `compose.py` to create a custom container.
 Note: Created container version will depend on the branch that was cloned.
 For example branch
- [r26.04](https://github.com/triton-inference-server/server/tree/r26.04)
-should be used to create a image based on the NGC 26.04 Triton release.
+ [r26.05](https://github.com/triton-inference-server/server/tree/r26.05)
+should be used to create a image based on the NGC 26.05 Triton release.
 
 `compose.py` provides `--backend`, `--repoagent` options that allow you to
 specify which backends and repository agents to include in the custom image.
@@ -78,20 +78,20 @@ For example, running
 ```
 python3 compose.py --backend pytorch --repoagent checksum
 ```
-on branch [r26.04](https://github.com/triton-inference-server/server/tree/r26.04) pulls:
-- `min` container `nvcr.io/nvidia/tritonserver:26.04-py3-min`
-- `full` container `nvcr.io/nvidia/tritonserver:26.04-py3`
+on branch [r26.05](https://github.com/triton-inference-server/server/tree/r26.05) pulls:
+- `min` container `nvcr.io/nvidia/tritonserver:26.05-py3-min`
+- `full` container `nvcr.io/nvidia/tritonserver:26.05-py3`
 
 Alternatively, users can specify the version of Triton container to pull from
 any branch by either:
 1. Adding flag `--container-version <container version>` to branch
 ```
-python3 compose.py --backend pytorch --repoagent checksum --container-version 26.04
+python3 compose.py --backend pytorch --repoagent checksum --container-version 26.05
 ```
 2. Specifying `--image min,<min container image name> --image full,<full container image name>`.
    The user is responsible for specifying compatible `min` and `full` containers.
 ```
-python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:26.04-py3-min --image full,nvcr.io/nvidia/tritonserver:26.04-py3
+python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:26.05-py3-min --image full,nvcr.io/nvidia/tritonserver:26.05-py3
 ```
 Method 1 and 2 will result in the same composed container. Furthermore,
 `--image` flag overrides the `--container-version` flag when both are specified.
@@ -102,8 +102,8 @@ Note:
 2. vLLM and TensorRT-LLM backends are currently not supported backends for
 `compose.py`. If you want to build additional backends on top of these backends,
 it would be better to [build it yourself](#build-it-yourself) by using
-`nvcr.io/nvidia/tritonserver:26.04-vllm-python-py3` or
-`nvcr.io/nvidia/tritonserver:26.04-trtllm-python-py3` as a `min` container.
+`nvcr.io/nvidia/tritonserver:26.05-vllm-python-py3` or
+`nvcr.io/nvidia/tritonserver:26.05-trtllm-python-py3` as a `min` container.
 
 
 ### CPU-only container composition
diff --git a/docs/getting_started/llm.md b/docs/getting_started/llm.md
index 052d7829ca..6ea9a5aa33 100644
--- a/docs/getting_started/llm.md
+++ b/docs/getting_started/llm.md
@@ -282,7 +282,7 @@ The above needs to be done manually with your favorite editor. Once finished, pl
     -v $(pwd)/all_models:/opt/all_models \
     -v $(pwd)/scripts:/opt/scripts \
     -v $(pwd)/Phi-3-mini-4k-instruct:/opt/Phi-3-mini-4k-instruct \
-    nvcr.io/nvidia/tritonserver:26.04-trtllm-python-py3
+    nvcr.io/nvidia/tritonserver:26.05-trtllm-python-py3
 
     # Launch Server
     python3 ../scripts/launch_triton_server.py --model_repo ../all_models/inflight_batcher_llm --world_size 1
@@ -308,7 +308,7 @@ The above needs to be done manually with your favorite editor. Once finished, pl
 
 <!---->
 
-    export RELEASE="26.04"
+    export RELEASE="26.05"
     docker run -it --net=host --gpus '"device=0"'  nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
 
 17. ## Download the Phi-3 tokenizer
diff --git a/docs/introduction/release_notes.md b/docs/introduction/release_notes.md
index 295734b89f..19fc0f22f3 100644
--- a/docs/introduction/release_notes.md
+++ b/docs/introduction/release_notes.md
@@ -25,9 +25,9 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 -->
-# [Triton Inference Server Release 26.04](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-26-04.html#rel-26-04)
+# [Triton Inference Server Release 26.05](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-26-05.html#rel-26-05)
 
-The Triton Inference Server container image, release 26.04, is available
+The Triton Inference Server container image, release 26.05, is available
 on [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver) and
 is open source
 on [GitHub](https://github.com/triton-inference-server/server). Release notes can
diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md
index d51ad5ab2f..ff837a4629 100644
--- a/docs/user_guide/performance_tuning.md
+++ b/docs/user_guide/performance_tuning.md
@@ -235,7 +235,7 @@ with a `tritonserver` binary.
 
 ```bash
 # Start server container
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:26.04-py3
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:26.05-py3
 
 # Start serving your models
 tritonserver --model-repository=/mnt/models
@@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u
 
 ```bash
 # Start the SDK container interactively
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:26.04-py3-sdk
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:26.05-py3-sdk
 
 # Benchmark model being served from step 3
 perf_analyzer -m densenet_onnx --concurrency-range 1:4
diff --git a/python/openai/README.md b/python/openai/README.md
index 4598a5a43f..4134e72cb6 100644
--- a/python/openai/README.md
+++ b/python/openai/README.md
@@ -46,7 +46,7 @@
 docker run -it --net=host --gpus all --rm \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
   -e HF_TOKEN \
-  nvcr.io/nvidia/tritonserver:26.04-vllm-python-py3
+  nvcr.io/nvidia/tritonserver:26.05-vllm-python-py3
 ```
 
 2. Launch the OpenAI-compatible Triton Inference Server:
@@ -355,7 +355,7 @@ Currently, OpenAI-Compatible Frontend supports loading embedding models and embe
 docker run -it --net=host --gpus all --rm \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
   -e HF_TOKEN \
-  nvcr.io/nvidia/tritonserver:26.04-vllm-python-py3
+  nvcr.io/nvidia/tritonserver:26.05-vllm-python-py3
 ```
 
 2. Launch the OpenAI-compatible Triton Inference Server:
@@ -451,7 +451,7 @@ docker run -it --net=host --gpus all --rm \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
   -e HF_TOKEN \
   -e TRTLLM_ORCHESTRATOR=1 \
-  nvcr.io/nvidia/tritonserver:26.04-trtllm-python-py3
+  nvcr.io/nvidia/tritonserver:26.05-trtllm-python-py3
 ```
 
 2. Install dependencies inside the container:
diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models
index 7fcdb49823..4d491fa2a1 100755
--- a/qa/common/gen_jetson_trt_models
+++ b/qa/common/gen_jetson_trt_models
@@ -34,7 +34,7 @@
 # Make all generated files accessible outside of container
 umask 0000
 # Set the version of the models
-TRITON_VERSION=${TRITON_VERSION:=26.04}
+TRITON_VERSION=${TRITON_VERSION:=26.05}
 # Set the CUDA device to use
 NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:=0}
 # Set TensorRT image
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index 328f42bbe0..cb1b680800 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -66,7 +66,7 @@ log_message.status "Changing working directory to the script directory to: " "${
 cd ${TRITON_MDLS_BASE_SCRIPT_DIR}
 
 log_message.status "define: default values"
-TRITON_VERSION=${TRITON_VERSION:=26.04}
+TRITON_VERSION=${TRITON_VERSION:=26.05}
 ONNX_VERSION=1.20.1
 ONNX_OPSET=0
 OPENVINO_VERSION=2024.5.0

From cfd8ec8e35fb4d68303f38206c17fd96879b3f91 Mon Sep 17 00:00:00 2001
From: J Wyman <jwyman@nvidia.com>
Date: Fri, 15 May 2026 15:13:04 -0400
Subject: [PATCH 2/2] test: Add Torch AOTI Tests (#8771)

This change:

Creates a new L0_torch_aoti test suite.
Adds complex Torch AOTI model generation to qa/common/gen_qa_models.py.
Cleans up existing AOTI model generation in qa/common/gen_qa_models.py.
Enables torchvision AOTI model generation in qa/common/gen_qa_model_repository.
---
 qa/L0_torch_aoti/test.sh                  | 148 +++++++
 qa/L0_torch_aoti/torch_aoti_infer_test.py | 284 +++++++++++++
 qa/common/gen_qa_model_repository         |   2 +-
 qa/common/gen_qa_models.py                | 492 +++++++++++++++++-----
 4 files changed, 821 insertions(+), 105 deletions(-)
 create mode 100755 qa/L0_torch_aoti/test.sh
 create mode 100755 qa/L0_torch_aoti/torch_aoti_infer_test.py

diff --git a/qa/L0_torch_aoti/test.sh b/qa/L0_torch_aoti/test.sh
new file mode 100755
index 0000000000..f37751c55e
--- /dev/null
+++ b/qa/L0_torch_aoti/test.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
+# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+source ../common/util.sh
+
+if [[ "${DEBUG}" == "true" ]]; then
+    set -x
+else
+    set +x
+fi
+
+COLOR_DARK="\033[90m"
+COLOR_ERROR="\033[31m"
+COLOR_INFO="\033[94m"
+COLOR_RESET="\033[0m"
+COLOR_STATUS="\033[36m"
+COLOR_SUCCESS="\033[32m"
+COLOR_WARNING="\033[33m"
+RET=0
+
+REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
+if [[ "$#" -ge 1 ]]; then
+    REPO_VERSION=$1
+fi
+if [[ -z "$REPO_VERSION" ]]; then
+    echo -e "${COLOR_ERROR}Repository version must be specified${COLOR_RESET}" 1>&2
+    echo -e "${COLOR_ERROR}\n***\n*** Test Failed\n***${COLOR_RESET}" 1>&2
+    exit 1
+fi
+if [[ ! -z "$TEST_REPO_ARCH" ]]; then
+    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
+fi
+
+export CUDA_VISIBLE_DEVICES=0
+
+MODELDIR=${MODELDIR:=`pwd`/models}
+DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"}
+TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
+SERVER=${TRITON_DIR}/bin/tritonserver
+BACKEND_DIR=${TRITON_DIR}/backends
+
+# PyTorch on SBSA requires libgomp to be loaded first. See the following
+# GitHub issue for more information:
+# https://github.com/pytorch/pytorch/issues/2575
+arch=`uname -m`
+echo -e "${COLOR_DARK}Detected architecture: ${arch}${COLOR_RESET}"
+if [[ "${arch}" == "aarch64" ]]; then
+    SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1
+    echo -e "${COLOR_DARK}SERVER_LD_PRELOAD=${SERVER_LD_PRELOAD}${COLOR_RESET}"
+fi
+
+# If BACKENDS is not specified, default to the pytorch backend
+BACKENDS=${BACKENDS:="pytorch"}
+export BACKENDS
+
+# Copy the models into the model repository
+echo -e "${COLOR_DARK}Setting up model repository in ${MODELDIR}${COLOR_RESET}"
+rm -rf ${MODELDIR} && mkdir -p ${MODELDIR}
+models=(
+    "torch_aoti_complex_index"
+    "torch_aoti_complex_named"
+    "torch_aoti_int8_int8"
+    "torch_aoti_int16_int16"
+    "torch_aoti_int32_int32"
+    "torch_aoti_int64_int64"
+    "torch_aoti_float16_float16"
+    "torch_aoti_float32_float32"
+    "torchvision_aoti"
+)
+for model in "${models[@]}"; do
+    cp -r ${DATADIR}/qa_model_repository/${model} ${MODELDIR}/${model}
+    echo -e "${COLOR_DARK}ls ${MODELDIR}/${model}${COLOR_RESET}"
+    ls -lha ${MODELDIR}/${model}
+done
+echo -e "${COLOR_DARK}ls ${MODELDIR}${COLOR_RESET}"
+ls -lha ${MODELDIR}
+
+SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1"
+SERVER_LOG="./torch_aoti_complex_named-server.log"
+CLIENT_LOG="./torch_aoti_complex_named-client.log"
+
+echo -e "${COLOR_DARK}Running ${SERVER} with model repository ${MODELDIR}${COLOR_RESET}"
+run_server
+if [[ "${SERVER_PID}" -eq 0 ]]; then  # presumably run_server (../common/util.sh) leaves SERVER_PID at 0 on failure
+    echo -e "${COLOR_ERROR}\n***\n*** Failed to start ${SERVER}\n***${COLOR_RESET}" 1>&2
+    cat ${SERVER_LOG} 1>&2  # send the server log to stderr along with the failure banner
+    echo -e "\n" 1>&2
+    exit 1
+fi
+
+# Install torch framework
+echo -e "${COLOR_DARK}Installing PyTorch framework required by tests${COLOR_RESET}"
+pip install torch
+
+# Run the tests; output is appended to ${CLIENT_LOG} and replayed on stderr if the run fails
+TEST_NAME="torch_aoti_infer_test"
+python3 ./${TEST_NAME}.py >> ${CLIENT_LOG} 2>&1
+EXIT_CODE=$?
+if [[ ${EXIT_CODE} -ne 0 ]]; then
+    echo -e "${COLOR_ERROR}\n***\n*** Test '${TEST_NAME}' Failed with exit code ${EXIT_CODE}\n***${COLOR_RESET}" 1>&2
+    cat ${CLIENT_LOG} 1>&2
+    echo -e "\n" 1>&2
+    RET=1
+else
+    echo -e "${COLOR_INFO}\n***\n*** Test '${TEST_NAME}' Passed\n***${COLOR_RESET}"
+fi
+
+# Cleanup
+echo -e "${COLOR_DARK}Killing server (pid: ${SERVER_PID})${COLOR_RESET}"
+kill -s SIGINT ${SERVER_PID}
+wait ${SERVER_PID} || true
+echo -e "${COLOR_DARK}Removing model repository${COLOR_RESET}"
+for model in "${models[@]}"; do
+    rm -rf ${MODELDIR}/${model}
+done
+
+# Report results and exit; the failure banner goes to stderr, the success banner to stdout.
+if [[ ${RET} -ne 0 ]]; then
+    echo -e "${COLOR_ERROR}\n***\n*** Test Suite FAILED\n***${COLOR_RESET}" 1>&2
+else
+    echo -e "${COLOR_SUCCESS}\n***\n*** Test Suite PASSED\n***${COLOR_RESET}"
+fi
+
+exit ${RET}
diff --git a/qa/L0_torch_aoti/torch_aoti_infer_test.py b/qa/L0_torch_aoti/torch_aoti_infer_test.py
new file mode 100755
index 0000000000..2b93f31a48
--- /dev/null
+++ b/qa/L0_torch_aoti/torch_aoti_infer_test.py
@@ -0,0 +1,284 @@
+#!/usr/bin/python
+# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+sys.path.append("../common")
+
+import unittest
+
+import test_util as tu
+import torch
+import tritonclient.http as http
+
+
+class TorchAotiTest(tu.TestResultCollector):
+    def _get_complex_input_shape(self):  # shape of every complex-model input
+        return 1, 16
+
+    def _get_complex_output_shape(self):  # shape expected of every output
+        return 1, 16
+
+    def _get_complex_input_data(self, shape):
+        """Return four random INT8 numpy arrays of `shape`, values in [0, 127)."""
+        # One independently drawn array per input tensor of the complex model.
+        return [
+            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy()
+            for _ in range(4)
+        ]
+
+    def _get_simple_input_data(self, shape, io_type):
+        """Return a random numpy array of `shape`; ValueError on other dtypes."""
+        if io_type in (torch.int8, torch.int16, torch.int32, torch.int64):
+            return torch.randint(low=0, high=127, size=shape, dtype=io_type).numpy()
+        if io_type in (torch.float16, torch.float32, torch.float64):
+            return torch.randn(size=shape, dtype=io_type).numpy()
+        raise ValueError(f"Unsupported data type: {io_type}")
+
+    def _get_torchvision_input_data(self, shape):  # random FP32 numpy array
+        return torch.randn(shape, dtype=torch.float32).numpy()
+
+    def _dtype_to_triton_dtype(self, dtype):
+        """Map a torch dtype to its Triton datatype name; ValueError otherwise."""
+        triton_names = (
+            (torch.int8, "INT8"),
+            (torch.int16, "INT16"),
+            (torch.int32, "INT32"),
+            (torch.int64, "INT64"),
+            (torch.float16, "FP16"),
+            (torch.float32, "FP32"),
+        )
+        for candidate, triton_name in triton_names:
+            if dtype == candidate:
+                return triton_name
+        # No entry matched: the dtype has no mapping in this test.
+        raise ValueError(f"Unsupported data type: {dtype}")
+
+    def _get_simple_model_name(self, io_type):
+        """Return the name of the simple AOTI model serving `io_type`."""
+        model_names = (
+            (torch.int8, "torch_aoti_int8_int8"),
+            (torch.int16, "torch_aoti_int16_int16"),
+            (torch.int32, "torch_aoti_int32_int32"),
+            (torch.int64, "torch_aoti_int64_int64"),
+            (torch.float16, "torch_aoti_float16_float16"),
+            (torch.float32, "torch_aoti_float32_float32"),
+        )
+        for candidate, model_name in model_names:
+            if io_type == candidate:
+                return model_name
+        # Any other dtype has no model name in this test and is rejected.
+        raise ValueError(f"Unsupported data type: {io_type}")
+
+    def test_complex_index(self):
+        """Run the complex AOTI model addressed by ordinal tensor names.
+
+        Inputs are sent as INPUT__<n> and results are requested as
+        OUTPUT__<n>; each result is compared with the value computed
+        locally from the same random inputs.
+        """
+        model_name = "torch_aoti_complex_index"
+        tensors = self._get_complex_input_data(self._get_complex_input_shape())
+        expected_shape = self._get_complex_output_shape()
+
+        input_names = [
+            "INPUT__0",
+            "INPUT__1",
+            "INPUT__2",
+            "INPUT__3",
+        ]
+        output_names = [f"OUTPUT__{index}" for index in range(6)]
+
+        with http.InferenceServerClient("localhost:8000") as client:
+            inputs = []
+            for name, tensor in zip(input_names, tensors):
+                infer_input = http.InferInput(name, tensor.shape, "INT8")
+                infer_input.set_data_from_numpy(tensor, binary_data=True)
+                inputs.append(infer_input)
+
+            outputs = [
+                http.InferRequestedOutput(name, binary_data=True)
+                for name in output_names
+            ]
+
+            results = client.infer(model_name, inputs, outputs=outputs)
+            output_data = [results.as_numpy(name) for name in output_names]
+
+            self.assertEqual(len(outputs), len(output_data))
+            for data in output_data:
+                self.assertEqual(data.shape, expected_shape)
+
+            # Expected contents of OUTPUT__0 .. OUTPUT__5, in that order.
+            expected = [
+                tensors[0] + tensors[1],
+                tensors[0] - tensors[1],
+                tensors[0],
+                tensors[1],
+                tensors[2],
+                tensors[3],
+            ]
+            for actual, wanted in zip(output_data, expected):
+                self.assertTrue((actual == wanted).all())
+
+    def test_complex_named(self):
+        """Run the complex AOTI model addressed by structured tensor names.
+
+        Inputs are sent as ARGS[...] entries and results are requested as
+        RESULT[...] entries; each result is compared with the value computed
+        locally from the same random inputs.
+        """
+        model_name = "torch_aoti_complex_named"
+        tensors = self._get_complex_input_data(self._get_complex_input_shape())
+        expected_shape = self._get_complex_output_shape()
+
+        input_names = [
+            "ARGS[0]",
+            "ARGS[1]",
+            "ARGS[2][option1]",
+            "ARGS[2][option2]",
+        ]
+        # Requested output name -> value the model is expected to return for it.
+        expected = {
+            "RESULT[AAA]": tensors[0] + tensors[1],
+            "RESULT[BBB][0]": tensors[0],
+            "RESULT[BBB][1]": tensors[1],
+            "RESULT[CCC][option1]": tensors[2],
+            "RESULT[CCC][option2]": tensors[3],
+            "RESULT[ZZZ]": tensors[0] - tensors[1],
+        }
+        output_names = list(expected)
+
+        with http.InferenceServerClient("localhost:8000") as client:
+            inputs = []
+            for name, tensor in zip(input_names, tensors):
+                infer_input = http.InferInput(name, tensor.shape, "INT8")
+                infer_input.set_data_from_numpy(tensor, binary_data=True)
+                inputs.append(infer_input)
+
+            outputs = [
+                http.InferRequestedOutput(name, binary_data=True)
+                for name in output_names
+            ]
+
+            results = client.infer(model_name, inputs, outputs=outputs)
+            output_data = [results.as_numpy(name) for name in output_names]
+
+            self.assertEqual(len(outputs), len(output_data))
+            for data in output_data:
+                self.assertEqual(data.shape, expected_shape)
+
+            for data, name in zip(output_data, output_names):
+                self.assertTrue((data == expected[name]).all())
+
+    def test_simple_model(self):
+        """Check that every simple AOTI model returns the sum of its two inputs.
+
+        For each dtype two random vectors of 16 elements are sent as ARGS[0]
+        and ARGS[1], and RESULT is compared with their sum computed locally.
+        """
+        vector_shape = (16,)
+        io_types = [
+            torch.int8,
+            torch.int16,
+            torch.int32,
+            torch.int64,
+            torch.float16,
+            torch.float32,
+        ]
+
+        for io_type in io_types:
+            model_name = self._get_simple_model_name(io_type)
+            triton_type = self._dtype_to_triton_dtype(io_type)
+            output_names = ["RESULT"]
+
+            operands = (
+                self._get_simple_input_data(vector_shape, io_type),
+                self._get_simple_input_data(vector_shape, io_type),
+            )
+
+            # A separate client connection is opened for each dtype.
+            with http.InferenceServerClient("localhost:8000") as client:
+                inputs = []
+                for index, operand in enumerate(operands):
+                    infer_input = http.InferInput(
+                        f"ARGS[{index}]", operand.shape, triton_type
+                    )
+                    infer_input.set_data_from_numpy(operand, binary_data=True)
+                    inputs.append(infer_input)
+
+                outputs = [
+                    http.InferRequestedOutput(name, binary_data=True)
+                    for name in output_names
+                ]
+
+                results = client.infer(model_name, inputs, outputs=outputs)
+                output_data = [results.as_numpy(name) for name in output_names]
+
+                # RESULT must have the input shape and hold the element-wise sum.
+                self.assertEqual(len(outputs), len(output_data))
+                for data in output_data:
+                    self.assertEqual(data.shape, vector_shape)
+                    self.assertTrue((data == operands[0] + operands[1]).all())
+
+    def test_torchvision(self):
+        """Run the torchvision AOTI model and sanity-check its output.
+
+        Only the shape of RESULT and the finiteness of its values are
+        checked; no reference result is computed.
+        """
+        model_name = "torchvision_aoti"
+        input_shape = (1, 3, 224, 224)
+        expected_shape = (1, 1000)
+        output_names = ["RESULT"]
+
+        image = self._get_torchvision_input_data(input_shape)
+        # Overwrite the whole [0][0] slice of the random input with ones.
+        image[0][0] = 1.0
+
+        with http.InferenceServerClient("localhost:8000") as client:
+            infer_input = http.InferInput("ARGS[0]", image.shape, "FP32")
+            infer_input.set_data_from_numpy(image, binary_data=True)
+            inputs = [infer_input]
+
+            outputs = [
+                http.InferRequestedOutput(name, binary_data=True)
+                for name in output_names
+            ]
+
+            results = client.infer(model_name, inputs, outputs=outputs)
+            output_data = [results.as_numpy(name) for name in output_names]
+
+            self.assertEqual(len(outputs), len(output_data))
+            for data in output_data:
+                self.assertEqual(data.shape, expected_shape)
+                # NaN or infinite values in the output fail the test.
+                all_finite = torch.isfinite(torch.from_numpy(data)).all()
+                self.assertTrue(all_finite.item())
+
+
+if __name__ == "__main__":  # run the suite when executed as a script (test.sh does)
+    unittest.main()
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index cb1b680800..a503b64a10 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -263,9 +263,9 @@ set -e
 PATH=$PATH:/usr/local/cuda-13.0/bin
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --models_dir=$TRITON_MDLS_QA_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --models_dir=$TRITON_MDLS_QA_MODEL
+python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torchvision-aoti --models_dir=$TRITON_MDLS_QA_MODEL
 chmod -R 777 $TRITON_MDLS_QA_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL
-python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL
 chmod -R 777 $TRITON_MDLS_QA_VARIABLE_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_identity_models.py --libtorch --models_dir=$TRITON_MDLS_QA_IDENTITY_MODEL
 chmod -R 777 $TRITON_MDLS_QA_IDENTITY_MODEL
diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py
index d509562bff..a015dc0e2d 100755
--- a/qa/common/gen_qa_models.py
+++ b/qa/common/gen_qa_models.py
@@ -47,6 +47,7 @@
 from typing import List, Tuple
 
 _color_blue = "\033[94m"
+_color_cyan = "\033[36m"
 _color_green = "\033[32m"
 _color_magenta = "\033[35m"
 _color_red = "\033[31m"
@@ -1289,7 +1290,7 @@ def forward(self, INPUT0, INPUT1):
     traced.save(f"{model_version_dir}/model.pt")
 
 
-def generate_sample_inputs(
+def generate_torch_aoti_sample_inputs(
     input_shape,
     input_dtype,
     device,
@@ -1297,70 +1298,32 @@ def generate_sample_inputs(
     # handle for -1 (when variable) since can't create tensor with shape of [-1]
     input_shape = [abs(ips) for ips in input_shape]
 
-    if input_dtype == np.int8:
-        input0 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device)
-        input1 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device)
-    elif input_dtype == np.int16:
-        input0 = torch.randint(
-            -32768, 32767, input_shape, dtype=torch.int16, device=device
-        )
-        input1 = torch.randint(
-            -32768, 32767, input_shape, dtype=torch.int16, device=device
-        )
-    elif input_dtype == np.int32:
-        input0 = torch.randint(
-            -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device
-        )
-        input1 = torch.randint(
-            -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device
-        )
-    elif input_dtype == np.int64:
-        input0 = torch.randint(
-            -9223372036854775808,
-            9223372036854775807,
-            input_shape,
-            dtype=torch.int64,
-            device=device,
-        )
-        input1 = torch.randint(
-            -9223372036854775808,
-            9223372036854775807,
-            input_shape,
-            dtype=torch.int64,
-            device=device,
-        )
-    elif input_dtype == np.float16:
-        input0 = torch.randn(*input_shape, dtype=torch.float16, device=device)
-        input1 = torch.randn(*input_shape, dtype=torch.float16, device=device)
-    elif input_dtype == np.float32:
-        input0 = torch.randn(*input_shape, dtype=torch.float32, device=device)
-        input1 = torch.randn(*input_shape, dtype=torch.float32, device=device)
-    elif input_dtype == np.float64:
-        input0 = torch.randn(*input_shape, dtype=torch.float64, device=device)
-        input1 = torch.randn(*input_shape, dtype=torch.float64, device=device)
-    elif input_dtype == np.uint8:
-        input0 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device)
-        input1 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device)
-    elif input_dtype == np.uint16:
-        input0 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device)
-        input1 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device)
-    elif input_dtype == np.uint32:
-        input0 = torch.randint(
-            0, 4294967295, input_shape, dtype=torch.uint32, device=device
-        )
-        input1 = torch.randint(
-            0, 4294967295, input_shape, dtype=torch.uint32, device=device
-        )
-    elif input_dtype == np.uint64:
-        input0 = torch.randint(
-            0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device
-        )
-        input1 = torch.randint(
-            0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device
+    np_to_torch_dtype = {
+        np.int8: torch.int8,
+        np.int16: torch.int16,
+        np.int32: torch.int32,
+        np.int64: torch.int64,
+        np.float16: torch.float16,
+        np.float32: torch.float32,
+        np.float64: torch.float64,
+        np.uint8: torch.uint8,
+        np.uint16: torch.uint16,
+        np.uint32: torch.uint32,
+        np.uint64: torch.uint64,
+    }
+
+    if input_dtype not in np_to_torch_dtype:
+        print(
+            f"{_color_yellow}warning: dtype {input_dtype} is unsupported; falling back to torch.int32{_color_reset}"
         )
-    else:
-        input0 = torch.randn(*input_shape, device=device)
-        input1 = torch.randn(*input_shape, device=device)
+        input_dtype = np.int32
+
+    input0 = torch.zeros(
+        input_shape, dtype=np_to_torch_dtype[input_dtype], device=device
+    )
+    input1 = torch.zeros(
+        input_shape, dtype=np_to_torch_dtype[input_dtype], device=device
+    )
 
     return (input0, input1)
 
@@ -1395,7 +1358,7 @@ def np_to_dtype(np_dtype):
         return torch.int32
 
 
-def create_torch_aoti_modelfile(
+def create_torch_aoti_model_file(
     models_dir,
     model_version,
     input_shape,
@@ -1418,7 +1381,7 @@ def create_torch_aoti_modelfile(
         )
         return False
 
-    model_version_dir = f"{models_dir}/{model_name}/{model_version}"
+    model_version_dir = os.path.join(models_dir, model_name, str(model_version))
 
     print(f"{_color_green}Creating model {model_name}{_color_reset}")
 
@@ -1465,13 +1428,14 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor:
     model.to(device)
     model = model.eval()
 
-    sample_input = generate_sample_inputs(input_shape, input_dtype, device)
+    sample_inputs = generate_torch_aoti_sample_inputs(input_shape, input_dtype, device)
+    package_path = os.path.join(model_version_dir, "model.pt2")
 
     try:
-        ep = torch.export.export(model, sample_input)
+        exported_model = torch.export.export(model, sample_inputs)
         torch._inductor.aoti_compile_and_package(
-            ep,
-            package_path=f"{model_version_dir}/model.pt2",
+            exported_model,
+            package_path=package_path,
         )
     except Exception as e:
         print(
@@ -1484,13 +1448,162 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor:
     return True
 
 
-def create_torchvision_aoti_modelfile(
+def create_torch_aoti_complex_model_file(
     models_dir: str,
+):
+    """Compile the "torch_aoti_complex" test model and store its package.
+
+    The model takes two tensors and a dict of tensors and returns a dict
+    whose values are a tensor, a tuple of tensors and a dict of tensors.
+    It is exported with all-zero INT8 sample inputs of shape (1, 16),
+    compiled into "model.pt2" in version directory "1" of
+    "torch_aoti_complex_named", loaded back and executed once with random
+    inputs, and then copied into version directory "1" of
+    "torch_aoti_complex_index".
+
+    Args:
+        models_dir: Directory in which the two model directories are created.
+
+    Returns:
+        True when the package was built, validated and copied; False when
+        exporting/compiling or the validation run raised (the error is
+        printed to stderr).
+    """
+    base_name = "torch_aoti_complex"
+    named_version_dir = os.path.join(models_dir, f"{base_name}_named", "1")
+    index_version_dir = os.path.join(models_dir, f"{base_name}_index", "1")
+
+    # Make sure both version directories exist.
+    for version_dir in (named_version_dir, index_version_dir):
+        try:
+            os.makedirs(version_dir)
+        except OSError:
+            pass  # ignore existing dir
+
+    print(f"{_color_green}Creating model {base_name}{_color_reset}")
+
+    class TorchAotiComplex(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(
+            self,
+            hdata: torch.Tensor,
+            vdata: torch.Tensor,
+            options: dict[str, torch.Tensor],
+        ) -> dict[
+            str,
+            torch.Tensor | tuple[torch.Tensor, torch.Tensor] | dict[str, torch.Tensor],
+        ]:
+            # The keys are inserted in the order AAA, ZZZ, BBB, CCC.
+            return {
+                "AAA": hdata + vdata,
+                "ZZZ": hdata - vdata,
+                "BBB": (hdata, vdata),
+                "CCC": options,
+            }
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = TorchAotiComplex()
+    model.to(device)
+    model.eval()
+
+    tensor_shape = (1, 16)
+
+    def zero_tensor():
+        # All-zero INT8 tensor used as a sample input for the export.
+        return torch.zeros(tensor_shape, dtype=torch.int8, device=device)
+
+    def random_tensor():
+        # Random INT8 tensor with values in [0, 127) for the validation run.
+        return torch.randint(
+            low=0,
+            high=127,
+            size=tensor_shape,
+            dtype=torch.int8,
+            device=device,
+        )
+
+    sample_args = (
+        zero_tensor(),
+        zero_tensor(),
+        {
+            "option1": zero_tensor(),
+            "option2": zero_tensor(),
+        },
+    )
+
+    # Export and package the model
+    print(f"{_color_green}Exporting and packaging the model...{_color_reset}")
+
+    package_name = "model.pt2"
+    named_package = os.path.join(named_version_dir, package_name)
+    index_package = os.path.join(index_version_dir, package_name)
+
+    try:
+        exported_model = torch.export.export(model, sample_args)
+        torch._inductor.aoti_compile_and_package(
+            exported_model,
+            package_path=named_package,
+        )
+    except Exception as e:
+        print(
+            f"{_color_red}error: Failed to create model {base_name}{_color_reset}",
+            file=sys.stderr,
+        )
+        print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr)
+        return False
+
+    try:
+        # Now load and run the packaged model
+        print(f"{_color_cyan}Loading and running the packaged model...{_color_reset}")
+
+        compiled_model = torch._inductor.aoti_load_package(named_package)
+
+        print(f"{_color_cyan}Compiled model call spec:{_color_reset}")
+
+        for spec_entry in compiled_model.loader.get_call_spec():
+            print(spec_entry)
+
+        print(f"{_color_cyan}Running the compiled model...{_color_reset}")
+
+        with torch.inference_mode():
+            # The tensors are drawn in the order hdata, vdata, option1, option2.
+            hdata = random_tensor()
+            vdata = random_tensor()
+            options = {
+                "option1": random_tensor(),
+                "option2": random_tensor(),
+            }
+
+            # Only successful execution is checked; the result is discarded.
+            compiled_model(hdata, vdata, options)
+
+            print(
+                f'{_color_green}Model "{base_name}" successfully executed.{_color_reset}'
+            )
+    except Exception as e:
+        print(
+            f"{_color_red}error: Failed to validate model {base_name}{_color_reset}",
+            file=sys.stderr,
+        )
+        print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr)
+        return False
+
+    # Copy the compiled package into the second model directory: the named and
+    # the ordinal addressing variants of the model (from Triton's point of
+    # view) share the same compiled model.
+    shutil.copy(named_package, index_package)
+
+    return True
+
+
+def create_torchvision_aoti_model_file(
     models_dir: str,
     max_batch: int,
-    model_version: int,
 ):
     model_name = "torchvision_aoti"
-    model_version_dir = f"{models_dir}/{model_name}/{model_version}"
+    model_version_dir = os.path.join(models_dir, model_name, "1")
 
     try:
         os.makedirs(model_version_dir)
@@ -1504,16 +1617,16 @@ def create_torchvision_aoti_modelfile(
     model = model.to(device)
     model = model.eval()
 
+    SHAPE = (max_batch, 3, 224, 224)
+
     # Example input tensor with batch size 1 and 3 color channels (RGB), height and width of 224
-    input_tensor = torch.randn(max_batch, 3, 224, 224, device=device)
+    sample_inputs = (torch.zeros(SHAPE, dtype=torch.float32, device=device),)
 
-    try:
-        ep = torch.export.export(model, (input_tensor,))
+    package_path = os.path.join(model_version_dir, "model.pt2")
 
-        torch._inductor.aoti_compile_and_package(
-            ep,
-            package_path=f"{model_version_dir}/model.pt2",
-        )
+    try:
+        ep = torch.export.export(model, sample_inputs)
+        torch._inductor.aoti_compile_and_package(ep, package_path=package_path)
     except Exception as e:
         print(
             f"{_color_red}error: Failed to create model {model_name}{_color_reset}",
@@ -1609,9 +1722,11 @@ def create_libtorch_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    with open(f"{config_dir}/config.pbtxt", "w") as file:
+    config_path = os.path.join(config_dir, "config.pbtxt")
+
+    with open(config_path, "w") as file:
         file.write(config)
-        print(f"Created {config_dir}/config.pbtxt")
+        print(f"Created {config_path}")
 
     with open(f"{config_dir}/{label_filename}", "w") as file:
         for l in range(output0_label_cnt):
@@ -1619,7 +1734,7 @@ def create_libtorch_modelconfig(
         print(f"Created {config_dir}/{label_filename}")
 
 
-def create_torch_aoti_modelconfig(
+def create_torch_aoti_model_config(
     models_dir,
     input_shape,
     output_shape,
@@ -1650,7 +1765,7 @@ def create_torch_aoti_modelconfig(
     print(f"{_color_green}Creating config for {model_name}{_color_reset}")
 
     label_filename = "output_labels.txt"
-    config_dir = f"{models_dir}/{model_name}"
+    config_dir = os.path.join(models_dir, model_name)
     config = f"""
 backend: "pytorch"
 name: "{model_name}"
@@ -1658,19 +1773,19 @@ def create_torch_aoti_modelconfig(
 version_policy: {version_policy_str}
 input [
   {{
-    name: "INPUT0"
+    name: "ARGS[0]"
     data_type: {np_to_model_dtype(input_dtype)}
     dims: [ {tu.shape_to_dims_str(input_shape)} ]
   }},
   {{
-    name: "INPUT1"
+    name: "ARGS[1]"
     data_type: {np_to_model_dtype(input_dtype)}
     dims: [ {tu.shape_to_dims_str(input_shape)} ]
   }}
 ]
 output [
   {{
-    name: "OUTPUT__0"
+    name: "RESULT"
     data_type: {np_to_model_dtype(output_dtype)}
     dims: [ {tu.shape_to_dims_str(output_shape)} ]
     label_filename: "{label_filename}"
@@ -1684,17 +1799,173 @@ def create_torch_aoti_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    with open(f"{config_dir}/config.pbtxt", "w") as file:
+    config_path = os.path.join(config_dir, "config.pbtxt")
+
+    with open(config_path, "w") as file:
         file.write(config)
-        print(f"Created {config_dir}/config.pbtxt")
+        print(f"Created {config_path}")
 
-    with open(f"{config_dir}/{label_filename}", "w") as file:
+    label_path = os.path.join(config_dir, label_filename)
+
+    with open(label_path, "w") as file:
         for l in range(output_label_cnt):
             file.write(f"label{l}\n")
-        print(f"Created {config_dir}/{label_filename}")
+        print(f"Created {label_path}")
+
+
+def create_torch_aoti_complex_model_config(
+    models_dir,
+):
+    """Write config.pbtxt for the named and the ordinal complex AOTI models.
+
+    "torch_aoti_complex_named" uses ARGS[...]/RESULT[...] tensor names while
+    "torch_aoti_complex_index" uses INPUT__<n>/OUTPUT__<n>; both declare INT8
+    tensors with dims [1, 16] and are written below `models_dir`.
+    """
+    base_name = "torch_aoti_complex"
+    named_model = f"{base_name}_named"
+    index_model = f"{base_name}_index"
+    instance_kind = "KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"
+
+    print(f"{_color_green}Creating config for {base_name}{_color_reset}")
+
+    named_config = f"""
+backend: "pytorch"
+platform: "torch_aoti"
+name: "{named_model}"
+input: [
+  {{
+    name: "ARGS[0]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "ARGS[1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "ARGS[2][option1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "ARGS[2][option2]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+output: [
+  {{
+    name: "RESULT[AAA]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[BBB][0]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[BBB][1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[CCC][option1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[CCC][option2]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[ZZZ]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+instance_group [{{ kind: {instance_kind} }}]
+"""
+    index_config = f"""
+backend: "pytorch"
+name: "{index_model}"
+platform: "torch_aoti"
+input: [
+  {{
+    name: "INPUT__0"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "INPUT__1"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "INPUT__2"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "INPUT__3"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+output: [
+  {{
+    name: "OUTPUT__0"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__1"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__2"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__3"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__4"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__5"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+instance_group [{{ kind: {instance_kind} }}]
+"""
+
+    model_configs = {named_model: named_config, index_model: index_config}
+    for model_name, config in model_configs.items():
+        config_dir = os.path.join(models_dir, model_name)
+        try:
+            os.makedirs(config_dir)
+        except OSError:
+            pass  # ignore existing dir
 
+        config_path = os.path.join(config_dir, "config.pbtxt")
 
-def create_torchvision_aoti_modelconfig(
+        with open(config_path, "w") as file:
+            file.write(config)
+            print(f"Created {config_path}")
+
+
+def create_torchvision_aoti_model_config(
     models_dir: str,
     max_batch: int,
 ):
@@ -1703,7 +1974,7 @@ def create_torchvision_aoti_modelconfig(
 
     print(f"{_color_green}Creating config for {model_name}{_color_reset}")
 
-    config_dir = f"{models_dir}/{model_name}"
+    config_dir = os.path.join(models_dir, model_name)
     config = f"""
 backend: "pytorch"
 name: "{model_name}"
@@ -1711,14 +1982,13 @@ def create_torchvision_aoti_modelconfig(
 max_batch_size: {max_batch}
 input  [
   {{
-    name: "INPUT__0"
+    name: "ARGS[0]"
     data_type: TYPE_FP32
-    format: FORMAT_NCHW
     dims: [ 3, 224, 224 ]
   }}]
 output [
   {{
-    name: "OUTPUT__0"
+    name: "RESULT"
     data_type: TYPE_FP32
     dims: [ 1000 ]
     label_filename: "{label_filename}"
@@ -1732,15 +2002,19 @@ def create_torchvision_aoti_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    with open(f"{config_dir}/config.pbtxt", "w") as file:
+    config_path = os.path.join(config_dir, "config.pbtxt")
+
+    with open(config_path, "w") as file:
         file.write(config)
-        print(f"Created {config_dir}/config.pbtxt")
+        print(f"Created {config_path}")
 
     source_path = os.environ.get("TRITON_GENSRCDIR", default="gen_srcdir")
     source_filename = os.path.join(source_path, RESNET50_LABEL_FILE)
 
-    shutil.copyfile(source_filename, f"{config_dir}/{label_filename}")
-    print(f"Created {config_dir}/{label_filename}")
+    target_path = os.path.join(config_dir, label_filename)
+
+    shutil.copyfile(source_filename, target_path)
+    print(f"Created {target_path}")
 
 
 def create_openvino_modelfile(
@@ -2106,14 +2380,14 @@ def create_models(
                 f"{_color_magenta}PyTorch: AOTI model generation requested{_color_reset}"
             )
             # max-batch 8
-            if create_torch_aoti_modelfile(
+            if create_torch_aoti_model_file(
                 models_dir,
                 model_version,
                 input_shape,
                 input_dtype,
                 output0_dtype,
             ):
-                create_torch_aoti_modelconfig(
+                create_torch_aoti_model_config(
                     models_dir,
                     input_shape,
                     output0_shape,
@@ -2352,6 +2626,8 @@ def create_fixed_models(
     if FLAGS.onnx:
         import onnx
     if FLAGS.libtorch or FLAGS.torch_aoti:
+        import shutil
+
         import torch
         from torch import nn
     if FLAGS.torchvision_aoti:
@@ -2747,7 +3023,15 @@ def create_fixed_models(
             for model_shape in [(-1,), (-1, -1), (-1, -1, -1)]:
                 emu.create_nop_modelconfig(FLAGS.models_dir, model_shape, model_dtype)
 
+    if FLAGS.torch_aoti:
+        print(
+            f"{_color_magenta}PyTorch: Complex AOTI model generation requested{_color_reset}"
+        )
+        if create_torch_aoti_complex_model_file(FLAGS.models_dir):
+            create_torch_aoti_complex_model_config(FLAGS.models_dir)
+
     if FLAGS.torchvision_aoti:
+        # TODO: Add support for variable batch size and version policy for torchvision AOTI models.
         print(f"{_color_blue}TorchVision AOTI model generation requested{_color_reset}")
-        if create_torchvision_aoti_modelfile(FLAGS.models_dir, 1, 1):
-            create_torchvision_aoti_modelconfig(FLAGS.models_dir, 1)
+        if create_torchvision_aoti_model_file(FLAGS.models_dir, 1):
+            create_torchvision_aoti_model_config(FLAGS.models_dir, 1)