From 0c063cf18e57dc90b8129745f1cbfe47e6257716 Mon Sep 17 00:00:00 2001
From: yinggeh <yinggeh@nvidia.com>
Date: Wed, 13 May 2026 21:58:21 -0700
Subject: [PATCH 1/6] Initial commit

---
 docs/user_guide/request_cancellation.md | 19 ++++---
 qa/L0_request_cancellation/test.sh      | 68 +++++++++++++++++++++++--
 2 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/docs/user_guide/request_cancellation.md b/docs/user_guide/request_cancellation.md
index 753d03968a..10a676ced0 100644
--- a/docs/user_guide/request_cancellation.md
+++ b/docs/user_guide/request_cancellation.md
@@ -28,14 +28,14 @@
 
 # Request Cancellation
 
-Starting from r23.10, Triton supports handling request cancellation received
-from the gRPC client or a C API user. Long running inference requests such
-as for auto generative large language models may run for an indeterminate
-amount of time or indeterminate number of steps. Additionally clients may
-enqueue a large number of requests as part of a sequence or request stream
-and later determine the results are no longer needed. Continuing to process
-requests whose results are no longer required can significantly impact server
-resources.
+Triton supports handling request cancellation received from the gRPC Python
+client or a C API user (since r23.10), and C++ client (since r26.05).
+Long running inference requests such as for auto generative large language
+models may run for an indeterminate amount of time or indeterminate number of
+steps. Additionally clients may enqueue a large number of requests as part of
+a sequence or request stream and later determine the results are no longer
+needed. Continuing to process requests whose results are no longer required can
+significantly impact server resources.
 
 ## Issuing Request Cancellation
 
@@ -51,8 +51,7 @@ about the APIs in [tritonserver.h](https://github.com/triton-inference-server/co
 
 In addition, [gRPC endpoint](../customization_guide/inference_protocols.md#httprest-and-grpc-protocols) can
 now detect cancellation from the client and attempt to terminate request.
-At present, only gRPC python client supports issuing request cancellation
-to the server endpoint. See [request-cancellation](https://github.com/triton-inference-server/client#request-cancellation)
+See [request-cancellation](https://github.com/triton-inference-server/client#request-cancellation)
 for more details on how to issue requests from the client-side.
 See gRPC guide on RPC [cancellation](https://grpc.io/docs/guides/cancellation/) for
 finer details.
diff --git a/qa/L0_request_cancellation/test.sh b/qa/L0_request_cancellation/test.sh
index d8cffb91e6..9174650f3b 100755
--- a/qa/L0_request_cancellation/test.sh
+++ b/qa/L0_request_cancellation/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -42,8 +42,10 @@ export CUDA_VISIBLE_DEVICES=0
 
 SERVER=/opt/tritonserver/bin/tritonserver
 source ../common/util.sh
+CANCEL_LOG_LINE="Cancellation notification received for"
 
 RET=0
+rm -f *.log
 
 #
 # Unit tests
@@ -66,7 +68,7 @@ if [ $? -ne 0 ]; then
 fi
 
 #
-# gRPC cancellation tests
+# Python gRPC cancellation tests
 #
 rm -rf models && mkdir models
 mkdir -p models/custom_identity_int32/1 && (cd models/custom_identity_int32 && \
@@ -121,7 +123,7 @@ for TEST_CASE in "test_grpc_async_infer" \
         RET=1
     fi
 
-    count=$(grep -o "Cancellation notification received for" $SERVER_LOG | wc -l)
+    count=$(grep -o "$CANCEL_LOG_LINE" $SERVER_LOG | wc -l)
     if [ $count == 0 ]; then
         echo -e "\n***\n*** Cancellation not received by server on $TEST_CASE\n***"
         cat $SERVER_LOG
@@ -170,6 +172,66 @@ for TEST_CASE in "test_grpc_async_infer" \
     fi
 done
 
+#
+# C++ gRPC cancellation tests
+#
+GRPC_CANCELLATION_TEST_CPP=../clients/grpc_cancellation_test
+
+for ENTRY in "TestGrpcAsyncInfer 1" \
+             "TestGrpcAsyncInferCancelAfterCompletionIsNoOp 0" \
+             "TestGrpcAsyncInferWithoutContextStillCompletes 0" \
+             "TestGrpcAsyncInferMulti 2" \
+             "TestGrpcStreamInfer 1" \
+             "TestGrpcStreamCancelWithoutInfer 1" \
+             "TestGrpcStreamCancelThenRestart 1"; do
+    read -r TEST_CASE EXPECTED_CANCEL_COUNT <<< "$ENTRY"
+
+    TEST_LOG="./grpc_cancellation_test_cpp.$TEST_CASE.log"
+    SERVER_LOG="./grpc_cancellation_test_cpp.$TEST_CASE.server.log"
+
+    # AsyncInferMulti fans out N concurrent requests; bump to 3 CPU
+    # instances so each can execute in parallel. Reverted after the test
+    # so subsequent cases keep the default single-instance config.
+    if [ "$TEST_CASE" == "TestGrpcAsyncInferMulti" ]; then
+        sed -i 's|instance_group .*|instance_group [{ count: 3, kind: KIND_CPU }]|' \
+            models/custom_identity_int32/config.pbtxt
+    fi
+
+    SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2"
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        exit 1
+    fi
+
+    set +e
+    LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH \
+        $GRPC_CANCELLATION_TEST_CPP \
+            --gtest_filter="GrpcCancellationTest.$TEST_CASE" > $TEST_LOG 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** C++ gRPC Cancellation Tests Failed on $TEST_CASE\n***"
+        cat $TEST_LOG
+        RET=1
+    fi
+
+    cancel_count=$(grep -c "$CANCEL_LOG_LINE" $SERVER_LOG || true)
+    if [ $cancel_count -ne $EXPECTED_CANCEL_COUNT ]; then
+        echo -e "\n***\n*** Unexpected cancellation count on $TEST_CASE. Expected $EXPECTED_CANCEL_COUNT but received $cancel_count.\n***"
+        cat $SERVER_LOG
+        RET=1
+    fi
+    set -e
+
+    kill $SERVER_PID
+    wait $SERVER_PID
+
+    if [ "$TEST_CASE" == "TestGrpcAsyncInferMulti" ]; then
+        sed -i 's|instance_group .*|instance_group [{ kind: KIND_CPU }]|' \
+            models/custom_identity_int32/config.pbtxt
+    fi
+done
+
 #
 # End-to-end scheduler tests
 #

From 219f0a591cc89b3da60595d77876dcead9b4d80b Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Mon, 18 May 2026 08:41:36 -0700
Subject: [PATCH 2/6] Add in-queue request cancellation

---
 qa/L0_request_cancellation/test.sh | 32 +++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/qa/L0_request_cancellation/test.sh b/qa/L0_request_cancellation/test.sh
index 9174650f3b..2b92c12027 100755
--- a/qa/L0_request_cancellation/test.sh
+++ b/qa/L0_request_cancellation/test.sh
@@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0
 
 SERVER=/opt/tritonserver/bin/tritonserver
 source ../common/util.sh
-CANCEL_LOG_LINE="Cancellation notification received for"
+CANCEL_LOG_LINE="Cancellation notification received for "
 
 RET=0
 rm -f *.log
@@ -175,13 +175,27 @@ done
 #
 # C++ gRPC cancellation tests
 #
+# allow_timeout_override disables queue prefetching, keeping requests queued
+# long enough for the "Queued" cancellation tests to cancel them before
+# forwarding to the rate limiter. This saves overall test time.
+cat >> models/custom_identity_int32/config.pbtxt <<'EOF'
+dynamic_batching {
+  default_queue_policy {
+    allow_timeout_override: true
+  }
+}
+EOF
+
 GRPC_CANCELLATION_TEST_CPP=../clients/grpc_cancellation_test
 
-for ENTRY in "TestGrpcAsyncInfer 1" \
+for ENTRY in "TestGrpcAsyncInferCancelExecutingRequest 1" \
+             "TestGrpcAsyncInferCancelQueuedRequest 2" \
              "TestGrpcAsyncInferCancelAfterCompletionIsNoOp 0" \
              "TestGrpcAsyncInferWithoutContextStillCompletes 0" \
-             "TestGrpcAsyncInferMulti 2" \
-             "TestGrpcStreamInfer 1" \
+             "TestGrpcAsyncInferMultiCancelExecutingRequests 2" \
+             "TestGrpcAsyncInferMultiCancelQueuedRequest 2" \
+             "TestGrpcStreamInferCancelExecutingRequest 1" \
+             "TestGrpcStreamInferCancelQueuedRequest 1" \
              "TestGrpcStreamCancelWithoutInfer 1" \
              "TestGrpcStreamCancelThenRestart 1"; do
     read -r TEST_CASE EXPECTED_CANCEL_COUNT <<< "$ENTRY"
@@ -189,10 +203,10 @@ for ENTRY in "TestGrpcAsyncInfer 1" \
     TEST_LOG="./grpc_cancellation_test_cpp.$TEST_CASE.log"
     SERVER_LOG="./grpc_cancellation_test_cpp.$TEST_CASE.server.log"
 
-    # AsyncInferMulti fans out N concurrent requests; bump to 3 CPU
-    # instances so each can execute in parallel. Reverted after the test
-    # so subsequent cases keep the default single-instance config.
-    if [ "$TEST_CASE" == "TestGrpcAsyncInferMulti" ]; then
+    # AsyncInferMulti fans out N concurrent requests; bump to 3 CPU instances
+    # so each can execute in parallel. Every other test uses the default
+    # single-instance config.
+    if [ "$TEST_CASE" == "TestGrpcAsyncInferMultiCancelExecutingRequests" ]; then
         sed -i 's|instance_group .*|instance_group [{ count: 3, kind: KIND_CPU }]|' \
             models/custom_identity_int32/config.pbtxt
     fi
@@ -226,7 +240,7 @@ for ENTRY in "TestGrpcAsyncInfer 1" \
     kill $SERVER_PID
     wait $SERVER_PID
 
-    if [ "$TEST_CASE" == "TestGrpcAsyncInferMulti" ]; then
+    if [ "$TEST_CASE" == "TestGrpcAsyncInferMultiCancelExecutingRequests" ]; then
         sed -i 's|instance_group .*|instance_group [{ kind: KIND_CPU }]|' \
             models/custom_identity_int32/config.pbtxt
     fi

From 89e6a014dd0d187431ecfcf5ac11a2c3b039ae07 Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Mon, 18 May 2026 09:25:30 -0700
Subject: [PATCH 3/6] Revert "test: Add Torch AOTI Tests (#8771)"

This reverts commit 5133f7ba17ae59ae34b232fcddd72dae9b26c1e9.
---
 qa/L0_torch_aoti/test.sh                  | 148 -------
 qa/L0_torch_aoti/torch_aoti_infer_test.py | 284 -------------
 qa/common/gen_qa_model_repository         |   2 +-
 qa/common/gen_qa_models.py                | 492 +++++-----------------
 4 files changed, 105 insertions(+), 821 deletions(-)
 delete mode 100755 qa/L0_torch_aoti/test.sh
 delete mode 100755 qa/L0_torch_aoti/torch_aoti_infer_test.py

diff --git a/qa/L0_torch_aoti/test.sh b/qa/L0_torch_aoti/test.sh
deleted file mode 100755
index f37751c55e..0000000000
--- a/qa/L0_torch_aoti/test.sh
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/bin/bash
-# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-source ../common/util.sh
-
-if [[ "${DEBUG}" == "true" ]]; then
-    set -x
-else
-    set +x
-fi
-
-COLOR_DARK="\033[90m"
-COLOR_ERROR="\033[31m"
-COLOR_INFO="\033[94m"
-COLOR_RESET="\033[0m"
-COLOR_STATUS="\033[36m"
-COLOR_SUCCESS="\033[32m"
-COLOR_WARNING="\033[33m"
-RET=0
-
-REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
-if [[ "$#" -ge 1 ]]; then
-    REPO_VERSION=$1
-fi
-if [[ -z "$REPO_VERSION" ]]; then
-    echo -e "${COLOR_ERROR}Repository version must be specified${COLOR_RESET}" 1>&2
-    echo -e "${COLOR_ERROR}\n***\n*** Test Failed\n***${COLOR_RESET}" 1>&2
-    exit 1
-fi
-if [[ ! -z "$TEST_REPO_ARCH" ]]; then
-    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
-fi
-
-export CUDA_VISIBLE_DEVICES=0
-
-MODELDIR=${MODELDIR:=`pwd`/models}
-DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"}
-TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
-SERVER=${TRITON_DIR}/bin/tritonserver
-BACKEND_DIR=${TRITON_DIR}/backends
-
-# PyTorch on SBSA requires libgomp to be loaded first. See the following
-# GitHub issue for more information:
-# https://github.com/pytorch/pytorch/issues/2575
-arch=`uname -m`
-echo -e "${COLOR_DARK}Detected architecture: ${arch}${COLOR_RESET}"
-if [[ "${arch}" == "aarch64" ]]; then
-    SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1
-    echo -e "${COLOR_DARK}SERVER_LD_PRELOAD=${SERVER_LD_PRELOAD}${COLOR_RESET}"
-fi
-
-# If BACKENDS not specified, set to all
-BACKENDS=${BACKENDS:="pytorch"}
-export BACKENDS
-
-# Copy the models into the model repository
-echo -e "${COLOR_DARK}Setting up model repository in ${MODELDIR}${COLOR_RESET}"
-rm -rf ${MODELDIR} && mkdir -p ${MODELDIR}
-models=(
-    "torch_aoti_complex_index"
-    "torch_aoti_complex_named"
-    "torch_aoti_int8_int8"
-    "torch_aoti_int16_int16"
-    "torch_aoti_int32_int32"
-    "torch_aoti_int64_int64"
-    "torch_aoti_float16_float16"
-    "torch_aoti_float32_float32"
-    "torchvision_aoti"
-)
-for model in "${models[@]}"; do
-    cp -r ${DATADIR}/qa_model_repository/${model} ${MODELDIR}/${model}
-    echo -e "${COLOR_DARK}ls ${MODELDIR}/${model}${COLOR_RESET}"
-    ls -lha ${MODELDIR}/${model}
-done
-echo -e "${COLOR_DARK}ls ${MODELDIR}${COLOR_RESET}"
-ls -lha ${MODELDIR}
-
-SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1"
-SERVER_LOG="./torch_aoti_complex_named-server.log"
-CLIENT_LOG="./torch_aoti_complex_named-client.log"
-
-echo -e "${COLOR_DARK}Running ${SERVER} with model repository ${MODELDIR}${COLOR_RESET}"
-run_server
-if [[ "${SERVER_PID}" -eq 0 ]]; then
-    echo -e "${COLOR_ERROR}\n***\n*** Failed to start ${SERVER}\n***${COLOR_RESET}" &1>2
-    cat ${SERVER_LOG} &1>2
-    echo -e "\n" &1>2
-    exit 1
-fi
-
-# Install torch framework
-echo -e "${COLOR_DARK}Installing PyTorch framework required by tests${COLOR_RESET}"
-pip install torch
-
-# Run the Tests
-TEST_NAME="torch_aoti_infer_test"
-python3 ./${TEST_NAME}.py >> ${CLIENT_LOG} 2>&1
-EXIT_CODE=$?
-if [[ ${EXIT_CODE} -ne 0 ]]; then
-    echo -e "${COLOR_ERROR}\n***\n*** Test '${TEST_NAME}' Failed with exit code ${EXIT_CODE}\n***${COLOR_RESET}" &1>2
-    cat ${CLIENT_LOG} &1>2
-    echo -e "\n" &1>2
-    RET=1
-else
-    echo -e "${COLOR_INFO}\n***\n*** Test '${TEST_NAME}' Passed\n***${COLOR_RESET}"
-fi
-
-# Cleanup
-echo -e "${COLOR_DARK}Killing server (pid: ${SERVER_PID})${COLOR_RESET}"
-kill -s SIGINT ${SERVER_PID}
-wait ${SERVER_PID} || true
-echo -e "${COLOR_DARK}Removing model repository${COLOR_RESET}"
-for model in "${models[@]}"; do
-    rm -rf ${MODELDIR}/${model}
-done
-
-# Report results and exit.
-if [[ ${RET} -ne 0 ]]; then
-    echo -e "${COLOR_ERROR}\n***\n*** Test Suite FAILED\n***${COLOR_RESET}" &1>2
-else
-    echo -e "${COLOR_SUCCESS}\n***\n*** Test Suite PASSED\n***${COLOR_RESET}"
-fi
-
-exit ${RET}
diff --git a/qa/L0_torch_aoti/torch_aoti_infer_test.py b/qa/L0_torch_aoti/torch_aoti_infer_test.py
deleted file mode 100755
index 2b93f31a48..0000000000
--- a/qa/L0_torch_aoti/torch_aoti_infer_test.py
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/python
-# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-
-sys.path.append("../common")
-
-import unittest
-
-import test_util as tu
-import torch
-import tritonclient.http as http
-
-
-class TorchAotiTest(tu.TestResultCollector):
-    def _get_complex_input_shape(self):
-        return (1, 16)
-
-    def _get_complex_output_shape(self):
-        return (1, 16)
-
-    def _get_complex_input_data(self, shape):
-        return [
-            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
-            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
-            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
-            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
-        ]
-
-    def _get_simple_input_data(self, shape, io_type):
-        if io_type in [torch.int8, torch.int16, torch.int32, torch.int64]:
-            return torch.randint(low=0, high=127, size=shape, dtype=io_type).numpy()
-        elif io_type in [torch.float16, torch.float32, torch.float64]:
-            return torch.randn(size=shape, dtype=io_type).numpy()
-        else:
-            raise ValueError(f"Unsupported data type: {io_type}")
-
-    def _get_torchvision_input_data(self, shape):
-        return torch.randn(size=shape, dtype=torch.float32).numpy()
-
-    def _dtype_to_triton_dtype(self, dtype):
-        if dtype == torch.int8:
-            return "INT8"
-        elif dtype == torch.int16:
-            return "INT16"
-        elif dtype == torch.int32:
-            return "INT32"
-        elif dtype == torch.int64:
-            return "INT64"
-        elif dtype == torch.float16:
-            return "FP16"
-        elif dtype == torch.float32:
-            return "FP32"
-        else:
-            raise ValueError(f"Unsupported data type: {dtype}")
-
-    def _get_simple_model_name(self, io_type):
-        if io_type == torch.int8:
-            return "torch_aoti_int8_int8"
-        elif io_type == torch.int16:
-            return "torch_aoti_int16_int16"
-        elif io_type == torch.int32:
-            return "torch_aoti_int32_int32"
-        elif io_type == torch.int64:
-            return "torch_aoti_int64_int64"
-        elif io_type == torch.float16:
-            return "torch_aoti_float16_float16"
-        elif io_type == torch.float32:
-            return "torch_aoti_float32_float32"
-        else:
-            raise ValueError(f"Unsupported data type: {io_type}")
-
-    def test_complex_index(self):
-        MODEL_NAME = "torch_aoti_complex_index"
-        INPUT_SHAPE = self._get_complex_input_shape()
-        OUTPUT_SHAPE = self._get_complex_output_shape()
-
-        input_data = self._get_complex_input_data(INPUT_SHAPE)
-
-        with http.InferenceServerClient("localhost:8000") as client:
-            inputs = [
-                http.InferInput("INPUT__0", input_data[0].shape, "INT8"),
-                http.InferInput("INPUT__1", input_data[1].shape, "INT8"),
-                http.InferInput("INPUT__2", input_data[2].shape, "INT8"),
-                http.InferInput("INPUT__3", input_data[3].shape, "INT8"),
-            ]
-
-            inputs[0].set_data_from_numpy(input_data[0], binary_data=True)
-            inputs[1].set_data_from_numpy(input_data[1], binary_data=True)
-            inputs[2].set_data_from_numpy(input_data[2], binary_data=True)
-            inputs[3].set_data_from_numpy(input_data[3], binary_data=True)
-
-            output_names = [
-                "OUTPUT__0",
-                "OUTPUT__1",
-                "OUTPUT__2",
-                "OUTPUT__3",
-                "OUTPUT__4",
-                "OUTPUT__5",
-            ]
-
-            outputs = []
-            for output_name in output_names:
-                outputs.append(http.InferRequestedOutput(output_name, binary_data=True))
-
-            output_data = []
-            results = client.infer(MODEL_NAME, inputs, outputs=outputs)
-
-            for output_name in output_names:
-                output_data.append(results.as_numpy(output_name))
-
-            self.assertEqual(len(outputs), len(output_data))
-            for data in output_data:
-                self.assertEqual(data.shape, OUTPUT_SHAPE)
-
-            self.assertTrue((output_data[0] == (input_data[0] + input_data[1])).all())
-            self.assertTrue((output_data[1] == input_data[0] - input_data[1]).all())
-            self.assertTrue((output_data[2] == input_data[0]).all())
-            self.assertTrue((output_data[3] == input_data[1]).all())
-            self.assertTrue((output_data[4] == input_data[2]).all())
-            self.assertTrue((output_data[5] == input_data[3]).all())
-
-    def test_complex_named(self):
-        MODEL_NAME = "torch_aoti_complex_named"
-        INPUT_SHAPE = self._get_complex_input_shape()
-        OUTPUT_SHAPE = self._get_complex_output_shape()
-
-        input_data = self._get_complex_input_data(INPUT_SHAPE)
-
-        with http.InferenceServerClient("localhost:8000") as client:
-            inputs = [
-                http.InferInput("ARGS[0]", input_data[0].shape, "INT8"),
-                http.InferInput("ARGS[1]", input_data[1].shape, "INT8"),
-                http.InferInput("ARGS[2][option1]", input_data[2].shape, "INT8"),
-                http.InferInput("ARGS[2][option2]", input_data[3].shape, "INT8"),
-            ]
-
-            inputs[0].set_data_from_numpy(input_data[0], binary_data=True)
-            inputs[1].set_data_from_numpy(input_data[1], binary_data=True)
-            inputs[2].set_data_from_numpy(input_data[2], binary_data=True)
-            inputs[3].set_data_from_numpy(input_data[3], binary_data=True)
-
-            output_names = [
-                "RESULT[AAA]",
-                "RESULT[BBB][0]",
-                "RESULT[BBB][1]",
-                "RESULT[CCC][option1]",
-                "RESULT[CCC][option2]",
-                "RESULT[ZZZ]",
-            ]
-
-            outputs = []
-            for output_name in output_names:
-                outputs.append(http.InferRequestedOutput(output_name, binary_data=True))
-
-            output_data = []
-            results = client.infer(MODEL_NAME, inputs, outputs=outputs)
-
-            for output_name in output_names:
-                output_data.append(results.as_numpy(output_name))
-
-            self.assertEqual(len(outputs), len(output_data))
-            for data in output_data:
-                self.assertEqual(data.shape, OUTPUT_SHAPE)
-
-            self.assertTrue((output_data[0] == (input_data[0] + input_data[1])).all())
-            self.assertTrue((output_data[1] == input_data[0]).all())
-            self.assertTrue((output_data[2] == input_data[1]).all())
-            self.assertTrue((output_data[3] == input_data[2]).all())
-            self.assertTrue((output_data[4] == input_data[3]).all())
-            self.assertTrue((output_data[5] == (input_data[0] - input_data[1])).all())
-
-    def test_simple_model(self):
-        io_types = [
-            torch.int8,
-            torch.int16,
-            torch.int32,
-            torch.int64,
-            torch.float16,
-            torch.float32,
-        ]
-        for io_type in io_types:
-            MODEL_NAME = self._get_simple_model_name(io_type)
-            INPUT_SHAPE = (16,)
-            OUTPUT_SHAPE = (16,)
-            TRITON_IO_TYPE = self._dtype_to_triton_dtype(io_type)
-
-            input_data = (
-                self._get_simple_input_data(INPUT_SHAPE, io_type),
-                self._get_simple_input_data(INPUT_SHAPE, io_type),
-            )
-
-            with http.InferenceServerClient("localhost:8000") as client:
-                inputs = [
-                    http.InferInput("ARGS[0]", input_data[0].shape, TRITON_IO_TYPE),
-                    http.InferInput("ARGS[1]", input_data[1].shape, TRITON_IO_TYPE),
-                ]
-
-                inputs[0].set_data_from_numpy(input_data[0], binary_data=True)
-                inputs[1].set_data_from_numpy(input_data[1], binary_data=True)
-
-                output_names = [
-                    "RESULT",
-                ]
-
-                outputs = []
-                for output_name in output_names:
-                    outputs.append(
-                        http.InferRequestedOutput(output_name, binary_data=True)
-                    )
-
-                output_data = []
-                results = client.infer(MODEL_NAME, inputs, outputs=outputs)
-
-                for output_name in output_names:
-                    output_data.append(results.as_numpy(output_name))
-
-                self.assertEqual(len(outputs), len(output_data))
-                for data in output_data:
-                    self.assertEqual(data.shape, OUTPUT_SHAPE)
-                    self.assertTrue((data == input_data[0] + input_data[1]).all())
-
-    def test_torchvision(self):
-        MODEL_NAME = "torchvision_aoti"
-        INPUT_SHAPE = (1, 3, 224, 224)
-        OUTPUT_SHAPE = (1, 1000)
-
-        input_data = self._get_torchvision_input_data(INPUT_SHAPE)
-        input_data[0][0] = 1.0
-
-        with http.InferenceServerClient("localhost:8000") as client:
-            inputs = [
-                http.InferInput("ARGS[0]", input_data.shape, "FP32"),
-            ]
-
-            inputs[0].set_data_from_numpy(input_data, binary_data=True)
-
-            output_names = [
-                "RESULT",
-            ]
-
-            outputs = []
-            for output_name in output_names:
-                outputs.append(http.InferRequestedOutput(output_name, binary_data=True))
-
-            output_data = []
-            results = client.infer(MODEL_NAME, inputs, outputs=outputs)
-
-            for output_name in output_names:
-                output_data.append(results.as_numpy(output_name))
-
-            self.assertEqual(len(outputs), len(output_data))
-            for data in output_data:
-                self.assertEqual(data.shape, OUTPUT_SHAPE)
-                output_tensor = torch.from_numpy(data)
-                self.assertTrue(torch.isfinite(output_tensor).all().item())
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index dfb19e3344..d490f8c530 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -263,9 +263,9 @@ set -e
 PATH=$PATH:/usr/local/cuda-13.0/bin
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --models_dir=$TRITON_MDLS_QA_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --models_dir=$TRITON_MDLS_QA_MODEL
-python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torchvision-aoti --models_dir=$TRITON_MDLS_QA_MODEL
 chmod -R 777 $TRITON_MDLS_QA_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL
+python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL
 chmod -R 777 $TRITON_MDLS_QA_VARIABLE_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_identity_models.py --libtorch --models_dir=$TRITON_MDLS_QA_IDENTITY_MODEL
 chmod -R 777 $TRITON_MDLS_QA_IDENTITY_MODEL
diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py
index cb8aa7a993..12d2a7225f 100755
--- a/qa/common/gen_qa_models.py
+++ b/qa/common/gen_qa_models.py
@@ -47,7 +47,6 @@
 from typing import List, Tuple
 
 _color_blue = "\033[94m"
-_color_cyan = "\033[36m"
 _color_green = "\033[32m"
 _color_magenta = "\033[35m"
 _color_red = "\033[31m"
@@ -1292,7 +1291,7 @@ def forward(self, INPUT0, INPUT1):
     traced.save(f"{model_version_dir}/model.pt")
 
 
-def generate_torch_aoti_sample_inputs(
+def generate_sample_inputs(
     input_shape,
     input_dtype,
     device,
@@ -1300,32 +1299,70 @@ def generate_torch_aoti_sample_inputs(
     # handle for -1 (when variable) since can't create tensor with shape of [-1]
     input_shape = [abs(ips) for ips in input_shape]
 
-    np_to_torch_dtype = {
-        np.int8: torch.int8,
-        np.int16: torch.int16,
-        np.int32: torch.int32,
-        np.int64: torch.int64,
-        np.float16: torch.float16,
-        np.float32: torch.float32,
-        np.float64: torch.float64,
-        np.uint8: torch.uint8,
-        np.uint16: torch.uint16,
-        np.uint32: torch.uint32,
-        np.uint64: torch.uint64,
-    }
-
-    if input_dtype not in np_to_torch_dtype:
-        print(
-            f"{_color_yellow}warning: dtype {input_dtype} is unsupported; falling back to torch.int32{_color_reset}"
+    if input_dtype == np.int8:
+        input0 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device)
+        input1 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device)
+    elif input_dtype == np.int16:
+        input0 = torch.randint(
+            -32768, 32767, input_shape, dtype=torch.int16, device=device
         )
-        input_dtype = np.int32
-
-    input0 = torch.zeros(
-        input_shape, dtype=np_to_torch_dtype[input_dtype], device=device
-    )
-    input1 = torch.zeros(
-        input_shape, dtype=np_to_torch_dtype[input_dtype], device=device
-    )
+        input1 = torch.randint(
+            -32768, 32767, input_shape, dtype=torch.int16, device=device
+        )
+    elif input_dtype == np.int32:
+        input0 = torch.randint(
+            -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device
+        )
+        input1 = torch.randint(
+            -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device
+        )
+    elif input_dtype == np.int64:
+        input0 = torch.randint(
+            -9223372036854775808,
+            9223372036854775807,
+            input_shape,
+            dtype=torch.int64,
+            device=device,
+        )
+        input1 = torch.randint(
+            -9223372036854775808,
+            9223372036854775807,
+            input_shape,
+            dtype=torch.int64,
+            device=device,
+        )
+    elif input_dtype == np.float16:
+        input0 = torch.randn(*input_shape, dtype=torch.float16, device=device)
+        input1 = torch.randn(*input_shape, dtype=torch.float16, device=device)
+    elif input_dtype == np.float32:
+        input0 = torch.randn(*input_shape, dtype=torch.float32, device=device)
+        input1 = torch.randn(*input_shape, dtype=torch.float32, device=device)
+    elif input_dtype == np.float64:
+        input0 = torch.randn(*input_shape, dtype=torch.float64, device=device)
+        input1 = torch.randn(*input_shape, dtype=torch.float64, device=device)
+    elif input_dtype == np.uint8:
+        input0 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device)
+        input1 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device)
+    elif input_dtype == np.uint16:
+        input0 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device)
+        input1 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device)
+    elif input_dtype == np.uint32:
+        input0 = torch.randint(
+            0, 4294967295, input_shape, dtype=torch.uint32, device=device
+        )
+        input1 = torch.randint(
+            0, 4294967295, input_shape, dtype=torch.uint32, device=device
+        )
+    elif input_dtype == np.uint64:
+        input0 = torch.randint(
+            0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device
+        )
+        input1 = torch.randint(
+            0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device
+        )
+    else:
+        input0 = torch.randn(*input_shape, device=device)
+        input1 = torch.randn(*input_shape, device=device)
 
     return (input0, input1)
 
@@ -1360,7 +1397,7 @@ def np_to_dtype(np_dtype):
         return torch.int32
 
 
-def create_torch_aoti_model_file(
+def create_torch_aoti_modelfile(
     models_dir,
     model_version,
     input_shape,
@@ -1383,7 +1420,7 @@ def create_torch_aoti_model_file(
         )
         return False
 
-    model_version_dir = os.path.join(models_dir, model_name, str(model_version))
+    model_version_dir = f"{models_dir}/{model_name}/{model_version}"
 
     print(f"{_color_green}Creating model {model_name}{_color_reset}")
 
@@ -1430,14 +1467,13 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor:
     model.to(device)
     model = model.eval()
 
-    sample_inputs = generate_torch_aoti_sample_inputs(input_shape, input_dtype, device)
-    package_path = os.path.join(model_version_dir, "model.pt2")
+    sample_input = generate_sample_inputs(input_shape, input_dtype, device)
 
     try:
-        exported_model = torch.export.export(model, sample_inputs)
+        ep = torch.export.export(model, sample_input)
         torch._inductor.aoti_compile_and_package(
-            exported_model,
-            package_path=package_path,
+            ep,
+            package_path=f"{model_version_dir}/model.pt2",
         )
     except Exception as e:
         print(
@@ -1450,162 +1486,13 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor:
     return True
 
 
-def create_torch_aoti_complex_model_file(
-    models_dir: str,
-):
-    base_name = "torch_aoti_complex"
-    model_names = [
-        f"{base_name}_named",
-        f"{base_name}_index",
-    ]
-    model_version_dirs = [
-        os.path.join(models_dir, model_names[0], "1"),
-        os.path.join(models_dir, model_names[1], "1"),
-    ]
-
-    for model_version_dir in model_version_dirs:
-        try:
-            os.makedirs(model_version_dir)
-        except OSError:
-            pass  # ignore existing dir
-
-    print(f"{_color_green}Creating model {base_name}{_color_reset}")
-
-    class TorchAotiComplex(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-
-        def forward(
-            self,
-            hdata: torch.Tensor,
-            vdata: torch.Tensor,
-            options: dict[str, torch.Tensor],
-        ) -> dict[
-            str,
-            torch.Tensor | tuple[torch.Tensor, torch.Tensor] | dict[str, torch.Tensor],
-        ]:
-            out = {
-                "AAA": hdata + vdata,
-                "ZZZ": hdata - vdata,
-                "BBB": (
-                    hdata,
-                    vdata,
-                ),
-                "CCC": options,
-            }
-
-            return out
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = TorchAotiComplex()
-    model.to(device)
-    model = model.eval()
-
-    SHAPE = (1, 16)
-
-    sample_args = (
-        torch.zeros(SHAPE, dtype=torch.int8, device=device),
-        torch.zeros(SHAPE, dtype=torch.int8, device=device),
-        {
-            "option1": torch.zeros(SHAPE, dtype=torch.int8, device=device),
-            "option2": torch.zeros(SHAPE, dtype=torch.int8, device=device),
-        },
-    )
-
-    # Export and package the model
-    print(f"{_color_green}Exporting and packaging the model...{_color_reset}")
-
-    model_file_name = "model.pt2"
-    package_paths = [
-        os.path.join(model_version_dirs[0], model_file_name),
-        os.path.join(model_version_dirs[1], model_file_name),
-    ]
-
-    try:
-        exported_model = torch.export.export(model, sample_args)
-        torch._inductor.aoti_compile_and_package(
-            exported_model,
-            package_path=package_paths[0],
-        )
-    except Exception as e:
-        print(
-            f"{_color_red}error: Failed to create model {base_name}{_color_reset}",
-            file=sys.stderr,
-        )
-        print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr)
-        return False
-
-    try:
-        # Now load and run the packaged model
-        print(f"{_color_cyan}Loading and running the packaged model...{_color_reset}")
-
-        compiled_model = torch._inductor.aoti_load_package(package_paths[0])
-
-        print(f"{_color_cyan}Compiled model call spec:{_color_reset}")
-
-        for elem in compiled_model.loader.get_call_spec():
-            print(elem)
-
-        print(f"{_color_cyan}Running the compiled model...{_color_reset}")
-
-        with torch.inference_mode():
-            hdata = torch.randint(
-                low=0,
-                high=127,
-                size=SHAPE,
-                dtype=torch.int8,
-                device=device,
-            )
-            vdata = torch.randint(
-                low=0,
-                high=127,
-                size=SHAPE,
-                dtype=torch.int8,
-                device=device,
-            )
-            options = {
-                "option1": torch.randint(
-                    low=0,
-                    high=127,
-                    size=SHAPE,
-                    dtype=torch.int8,
-                    device=device,
-                ),
-                "option2": torch.randint(
-                    low=0,
-                    high=127,
-                    size=SHAPE,
-                    dtype=torch.int8,
-                    device=device,
-                ),
-            }
-
-            _ = compiled_model(hdata, vdata, options)
-
-            print(
-                f'{_color_green}Model "{base_name}" successfully executed.{_color_reset}'
-            )
-    except Exception as e:
-        print(
-            f"{_color_red}error: Failed to validate model {base_name}{_color_reset}",
-            file=sys.stderr,
-        )
-        print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr)
-        return False
-
-    # Copy the compiled model package to the alternate model folder.
-    # Both the named and ordinal addressing versions of the model (from Triton's point-of-view) use the same compiled model.
-    shutil.copy(package_paths[0], package_paths[1])
-
-    return True
-
-
-def create_torchvision_aoti_model_file(
+def create_torchvision_aoti_modelfile(
     models_dir: str,
     max_batch: int,
+    model_version: int,
 ):
     model_name = "torchvision_aoti"
-    model_version_dir = os.path.join(models_dir, model_name, "1")
+    model_version_dir = f"{models_dir}/{model_name}/{model_version}"
 
     try:
         os.makedirs(model_version_dir)
@@ -1619,16 +1506,16 @@ def create_torchvision_aoti_model_file(
     model = model.to(device)
     model = model.eval()
 
-    SHAPE = (max_batch, 3, 224, 224)
-
     # Example input tensor with batch size 1 and 3 color channels (RGB), height and width of 224
-    sample_inputs = (torch.zeros(SHAPE, dtype=torch.float32, device=device),)
-
-    package_path = os.path.join(model_version_dir, "model.pt2")
+    input_tensor = torch.randn(max_batch, 3, 224, 224, device=device)
 
     try:
-        ep = torch.export.export(model, sample_inputs)
-        torch._inductor.aoti_compile_and_package(ep, package_path=package_path)
+        ep = torch.export.export(model, (input_tensor,))
+
+        torch._inductor.aoti_compile_and_package(
+            ep,
+            package_path=f"{model_version_dir}/model.pt2",
+        )
     except Exception as e:
         print(
             f"{_color_red}error: Failed to create model {model_name}{_color_reset}",
@@ -1724,11 +1611,9 @@ def create_libtorch_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    config_path = os.path.join(config_dir, "config.pbtxt")
-
-    with open(config_path, "w") as file:
+    with open(f"{config_dir}/config.pbtxt", "w") as file:
         file.write(config)
-        print(f"Created {config_path}")
+        print(f"Created {config_dir}/config.pbtxt")
 
     with open(f"{config_dir}/{label_filename}", "w") as file:
         for l in range(output0_label_cnt):
@@ -1736,7 +1621,7 @@ def create_libtorch_modelconfig(
         print(f"Created {config_dir}/{label_filename}")
 
 
-def create_torch_aoti_model_config(
+def create_torch_aoti_modelconfig(
     models_dir,
     input_shape,
     output_shape,
@@ -1767,7 +1652,7 @@ def create_torch_aoti_model_config(
     print(f"{_color_green}Creating config for {model_name}{_color_reset}")
 
     label_filename = "output_labels.txt"
-    config_dir = os.path.join(models_dir, model_name)
+    config_dir = f"{models_dir}/{model_name}"
     config = f"""
 backend: "pytorch"
 name: "{model_name}"
@@ -1775,19 +1660,19 @@ def create_torch_aoti_model_config(
 version_policy: {version_policy_str}
 input [
   {{
-    name: "ARGS[0]"
+    name: "INPUT0"
     data_type: {np_to_model_dtype(input_dtype)}
     dims: [ {tu.shape_to_dims_str(input_shape)} ]
   }},
   {{
-    name: "ARGS[1]"
+    name: "INPUT1"
     data_type: {np_to_model_dtype(input_dtype)}
     dims: [ {tu.shape_to_dims_str(input_shape)} ]
   }}
 ]
 output [
   {{
-    name: "RESULT"
+    name: "OUTPUT__0"
     data_type: {np_to_model_dtype(output_dtype)}
     dims: [ {tu.shape_to_dims_str(output_shape)} ]
     label_filename: "{label_filename}"
@@ -1801,173 +1686,17 @@ def create_torch_aoti_model_config(
     except OSError:
         pass  # ignore existing dir
 
-    config_path = os.path.join(config_dir, "config.pbtxt")
-
-    with open(config_path, "w") as file:
+    with open(f"{config_dir}/config.pbtxt", "w") as file:
         file.write(config)
-        print(f"Created {config_path}")
-
-    label_path = os.path.join(config_dir, label_filename)
+        print(f"Created {config_dir}/config.pbtxt")
 
-    with open(label_path, "w") as file:
+    with open(f"{config_dir}/{label_filename}", "w") as file:
         for l in range(output_label_cnt):
             file.write(f"label{l}\n")
-        print(f"Created {label_path}")
-
-
-def create_torch_aoti_complex_model_config(
-    models_dir,
-):
-    base_name = "torch_aoti_complex"
-    model_names = [
-        f"{base_name}_named",
-        f"{base_name}_index",
-    ]
-
-    print(f"{_color_green}Creating config for {base_name}{_color_reset}")
-
-    config_dirs = [
-        os.path.join(models_dir, model_names[0]),
-        os.path.join(models_dir, model_names[1]),
-    ]
-    configs = [
-        f"""
-backend: "pytorch"
-platform: "torch_aoti"
-name: "{model_names[0]}"
-input: [
-  {{
-    name: "ARGS[0]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "ARGS[1]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "ARGS[2][option1]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "ARGS[2][option2]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }}
-]
-output: [
-  {{
-    name: "RESULT[AAA]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "RESULT[BBB][0]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "RESULT[BBB][1]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "RESULT[CCC][option1]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "RESULT[CCC][option2]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "RESULT[ZZZ]"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }}
-]
-instance_group [{{ kind: {"KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"} }}]
-""",
-        f"""
-backend: "pytorch"
-name: "{model_names[1]}"
-platform: "torch_aoti"
-input: [
-  {{
-    name: "INPUT__0"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "INPUT__1"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "INPUT__2"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "INPUT__3"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }}
-]
-output: [
-  {{
-    name: "OUTPUT__0"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "OUTPUT__1"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "OUTPUT__2"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "OUTPUT__3"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "OUTPUT__4"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }},
-  {{
-    name: "OUTPUT__5"
-    data_type: TYPE_INT8
-    dims: [1, 16]
-  }}
-]
-instance_group [{{ kind: {"KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"} }}]
-""",
-    ]
-
-    for i in range(2):
-        config_dir = config_dirs[i]
-        try:
-            os.makedirs(config_dir)
-        except OSError:
-            pass  # ignore existing dir
-
-        config_path = os.path.join(config_dir, "config.pbtxt")
-
-        with open(config_path, "w") as file:
-            file.write(configs[i])
-            print(f"Created {config_path}")
+        print(f"Created {config_dir}/{label_filename}")
 
 
-def create_torchvision_aoti_model_config(
+def create_torchvision_aoti_modelconfig(
     models_dir: str,
     max_batch: int,
 ):
@@ -1976,7 +1705,7 @@ def create_torchvision_aoti_model_config(
 
     print(f"{_color_green}Creating config for {model_name}{_color_reset}")
 
-    config_dir = os.path.join(models_dir, model_name)
+    config_dir = f"{models_dir}/{model_name}"
     config = f"""
 backend: "pytorch"
 name: "{model_name}"
@@ -1984,13 +1713,14 @@ def create_torchvision_aoti_model_config(
 max_batch_size: {max_batch}
 input  [
   {{
-    name: "ARGS[0]"
+    name: "INPUT__0"
     data_type: TYPE_FP32
+    format: FORMAT_NCHW
     dims: [ 3, 224, 224 ]
   }}]
 output [
   {{
-    name: "RESULT"
+    name: "OUTPUT__0"
     data_type: TYPE_FP32
     dims: [ 1000 ]
     label_filename: "{label_filename}"
@@ -2004,19 +1734,15 @@ def create_torchvision_aoti_model_config(
     except OSError:
         pass  # ignore existing dir
 
-    config_path = os.path.join(config_dir, "config.pbtxt")
-
-    with open(config_path, "w") as file:
+    with open(f"{config_dir}/config.pbtxt", "w") as file:
         file.write(config)
-        print(f"Created {config_path}")
+        print(f"Created {config_dir}/config.pbtxt")
 
     source_path = os.environ.get("TRITON_GENSRCDIR", default="gen_srcdir")
     source_filename = os.path.join(source_path, RESNET50_LABEL_FILE)
 
-    target_path = os.path.join(config_dir, label_filename)
-
-    shutil.copyfile(source_filename, target_path)
-    print(f"Created {target_path}")
+    shutil.copyfile(source_filename, f"{config_dir}/{label_filename}")
+    print(f"Created {config_dir}/{label_filename}")
 
 
 def create_openvino_modelfile(
@@ -2382,14 +2108,14 @@ def create_models(
                 f"{_color_magenta}PyTorch: AOTI model generation requested{_color_reset}"
             )
             # max-batch 8
-            if create_torch_aoti_model_file(
+            if create_torch_aoti_modelfile(
                 models_dir,
                 model_version,
                 input_shape,
                 input_dtype,
                 output0_dtype,
             ):
-                create_torch_aoti_model_config(
+                create_torch_aoti_modelconfig(
                     models_dir,
                     input_shape,
                     output0_shape,
@@ -2628,8 +2354,6 @@ def create_fixed_models(
     if FLAGS.onnx:
         import onnx
     if FLAGS.libtorch or FLAGS.torch_aoti:
-        import shutil
-
         import torch
         from torch import nn
     if FLAGS.torchvision_aoti:
@@ -3025,15 +2749,7 @@ def create_fixed_models(
             for model_shape in [(-1,), (-1, -1), (-1, -1, -1)]:
                 emu.create_nop_modelconfig(FLAGS.models_dir, model_shape, model_dtype)
 
-    if FLAGS.torch_aoti:
-        print(
-            f"{_color_magenta}PyTorch: Complex AOTI model generation requested{_color_reset}"
-        )
-        if create_torch_aoti_complex_model_file(FLAGS.models_dir):
-            create_torch_aoti_complex_model_config(FLAGS.models_dir)
-
     if FLAGS.torchvision_aoti:
-        # TODO: Add support for variable batch size and version policy for torchvision AOTI models.
         print(f"{_color_blue}TorchVision AOTI model generation requested{_color_reset}")
-        if create_torchvision_aoti_model_file(FLAGS.models_dir, 1):
-            create_torchvision_aoti_model_config(FLAGS.models_dir, 1)
+        if create_torchvision_aoti_modelfile(FLAGS.models_dir, 1, 1):
+            create_torchvision_aoti_modelconfig(FLAGS.models_dir, 1)

From 1275bd227ca9e1e3f66438e8400bafb3fe1640f6 Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Mon, 18 May 2026 09:25:39 -0700
Subject: [PATCH 4/6] Revert "fix(qa): ommit plugin creation if TensorRT branch
 is missed. (#8783)"

This reverts commit 1e69d88b0fccd77c5782c316f6f32a7c2175622f.
---
 .../gen_qa_dyna_sequence_implicit_models.py   |  3 +-
 qa/common/gen_qa_dyna_sequence_models.py      |  6 +--
 qa/common/gen_qa_identity_models.py           |  6 +--
 qa/common/gen_qa_implicit_models.py           |  3 +-
 qa/common/gen_qa_model_repository             | 54 ++++++++++++-------
 qa/common/gen_qa_models.py                    |  6 +--
 qa/common/gen_qa_sequence_models.py           |  6 +--
 qa/common/gen_qa_trt_format_models.py         |  3 +-
 qa/common/gen_qa_trt_plugin_models.py         | 33 +++---------
 9 files changed, 53 insertions(+), 67 deletions(-)

diff --git a/qa/common/gen_qa_dyna_sequence_implicit_models.py b/qa/common/gen_qa_dyna_sequence_implicit_models.py
index c69ca28eab..32e4ebea13 100755
--- a/qa/common/gen_qa_dyna_sequence_implicit_models.py
+++ b/qa/common/gen_qa_dyna_sequence_implicit_models.py
@@ -575,8 +575,7 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_dyna_sequence_models.py b/qa/common/gen_qa_dyna_sequence_models.py
index 1a26890f32..7f8459b0da 100755
--- a/qa/common/gen_qa_dyna_sequence_models.py
+++ b/qa/common/gen_qa_dyna_sequence_models.py
@@ -129,8 +129,7 @@ def create_plan_shape_tensor_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
@@ -370,8 +369,7 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_identity_models.py b/qa/common/gen_qa_identity_models.py
index 426d939d9e..248126bcc2 100755
--- a/qa/common/gen_qa_identity_models.py
+++ b/qa/common/gen_qa_identity_models.py
@@ -584,8 +584,7 @@ def create_plan_dynamic_rf_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
     datatype_set = set([trt_dtype])
     for dt in datatype_set:
         if dt == trt.int8:
@@ -708,8 +707,7 @@ def create_plan_shape_tensor_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
     datatype_set = set([trt_dtype])
     for dt in datatype_set:
         if dt == trt.int8:
diff --git a/qa/common/gen_qa_implicit_models.py b/qa/common/gen_qa_implicit_models.py
index c0800098ec..06b86ee5c2 100755
--- a/qa/common/gen_qa_implicit_models.py
+++ b/qa/common/gen_qa_implicit_models.py
@@ -1066,8 +1066,7 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index d490f8c530..328f42bbe0 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -352,27 +352,43 @@ python3 $TRITON_MDLS_SRC_DIR/gen_qa_trt_format_models.py --models_dir=$TRITON_MD
 chmod -R 777 $TRITON_MDLS_QA_TRT_FORMAT_MODEL
 nvidia-smi --query-gpu=compute_cap | grep -qzE '10\.7|11\.0' && echo -e '${COLOR_WARNING}[WARNING]${COLOR_RESET} Skipping model generation for data dependent shape (NonZero not supported on this GPU)${COLOR_RESET}' || python3 $TRITON_MDLS_SRC_DIR/gen_qa_trt_data_dependent_shape.py --models_dir=$TRITON_MDLS_QA_TRT_DATA_DEPENDENT_MODEL
 chmod -R 777 $TRITON_MDLS_QA_TRT_DATA_DEPENDENT_MODEL
-# Build the custom Hardmax plugin used by L0_trt_plugin. The plugin source is
-# pulled from the public NVIDIA/TensorRT repo at the release/<major.minor>
-# branch matching the runtime TRT version. If the branch is not published
-# yet for this TRT version, the plugin model generation is skipped with a
-# warning -- L0_trt_plugin will report missing artifacts but the rest of
-# the QA model repository is still produced.
-TRT_BRANCH=\$(echo \${TRT_VERSION} | cut -d . -f -2)
-TRTSRC=/workspace/TensorRT
-rm -rf \${TRTSRC}
-if git clone --depth 1 -b release/\${TRT_BRANCH} \
-     https://github.com/NVIDIA/TensorRT.git \${TRTSRC}; then
-  cd \${TRTSRC}/samples/python/onnx_custom_plugin && \
-    rm -rf build && mkdir build && cd build && cmake .. && make -j && \
-    cp libcustomHardmaxPlugin.so ${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}/.
-  LD_PRELOAD=${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}/libcustomHardmaxPlugin.so \
-    python3 ${TRITON_MDLS_SRC_DIR}/gen_qa_trt_plugin_models.py \
-    --models_dir=${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}
-  chmod -R 777 ${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}
+# Make shared library for custom Hardmax plugin.
+if [ -d "/usr/src/tensorrt/samples/python/onnx_custom_plugin" ]; then
+    cd /usr/src/tensorrt/samples/python/onnx_custom_plugin
 else
-  echo "[WARNING] TensorRT release/\${TRT_BRANCH} not available on github.com/NVIDIA/TensorRT; skipping CustomHardmax plugin model generation. L0_trt_plugin coverage will be missing for this TRT version."
+    TRT_BRANCH=\$(echo \$TRT_VERSION | cut -d . -f -2)
+    if ! git clone -b release/\${TRT_BRANCH} --depth 1 https://github.com/NVIDIA/TensorRT.git /workspace/TensorRT; then
+      MAJOR=\$(echo "\$TRT_BRANCH" | cut -d . -f 1)
+      MINOR=\$(echo "\$TRT_BRANCH" | cut -d . -f 2)
+      if [ -n "\$MINOR" ] && [ "\$MINOR" -gt 0 ] 2>/dev/null; then
+        TRT_BRANCH="\${MAJOR}.\$((MINOR - 1))"
+        echo "Fallback: cloning TensorRT release/\${TRT_BRANCH} (previous minor)"
+        git clone -b release/\${TRT_BRANCH} --depth 1 https://github.com/NVIDIA/TensorRT.git /workspace/TensorRT
+      elif [ -n "\$MAJOR" ] && [ "\$MAJOR" -gt 0 ] 2>/dev/null; then
+        PREV_MAJOR=\$((MAJOR - 1))
+        echo "Fallback: MINOR is 0, querying remote for latest release/\${PREV_MAJOR}.x branch"
+        TRT_BRANCH=\$(git ls-remote --heads https://github.com/NVIDIA/TensorRT.git "refs/heads/release/\${PREV_MAJOR}.*" \
+          | awk -F'refs/heads/release/' '{print \$2}' \
+          | awk -F. '{print \$2, \$0}' \
+          | sort -k1,1n \
+          | tail -1 \
+          | awk '{print \$2}')
+        if [ -n "\$TRT_BRANCH" ]; then
+          echo "Fallback: cloning TensorRT release/\${TRT_BRANCH}"
+          git clone -b release/\${TRT_BRANCH} --depth 1 https://github.com/NVIDIA/TensorRT.git /workspace/TensorRT
+        else
+          exit 1
+        fi
+      else
+        exit 1
+      fi
+    fi
+    cd /workspace/TensorRT/samples/python/onnx_custom_plugin
 fi
+rm -rf build && mkdir build && \
+cd build && cmake .. && make -j && cp libcustomHardmaxPlugin.so $TRITON_MDLS_QA_TRT_PLUGIN_MODEL/.
+LD_PRELOAD=$TRITON_MDLS_QA_TRT_PLUGIN_MODEL/libcustomHardmaxPlugin.so python3 $TRITON_MDLS_SRC_DIR/gen_qa_trt_plugin_models.py --models_dir=$TRITON_MDLS_QA_TRT_PLUGIN_MODEL
+chmod -R 777 $TRITON_MDLS_QA_TRT_PLUGIN_MODEL
 exit 0
 EOF
 
diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py
index 12d2a7225f..d509562bff 100755
--- a/qa/common/gen_qa_models.py
+++ b/qa/common/gen_qa_models.py
@@ -159,8 +159,7 @@ def create_plan_dynamic_rf_modelfile(
     profile.set_shape("INPUT1", min_shape, opt_shape, max_shape)
 
     flags = 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
     for dt in datatype_set:
@@ -450,8 +449,7 @@ def create_plan_fixed_rf_modelfile(
     profile.set_shape("INPUT1", min_shape, opt_shape, max_shape)
 
     flags = 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
     for dt in datatype_set:
diff --git a/qa/common/gen_qa_sequence_models.py b/qa/common/gen_qa_sequence_models.py
index f8d89a5f9e..bf83a3a5f3 100755
--- a/qa/common/gen_qa_sequence_models.py
+++ b/qa/common/gen_qa_sequence_models.py
@@ -118,8 +118,7 @@ def create_plan_shape_tensor_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
@@ -321,8 +320,7 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_trt_format_models.py b/qa/common/gen_qa_trt_format_models.py
index 5f2cadd69e..5645a7178c 100755
--- a/qa/common/gen_qa_trt_format_models.py
+++ b/qa/common/gen_qa_trt_format_models.py
@@ -147,8 +147,7 @@ def create_plan_modelfile(
     # The build will fail if TensorRT cannot build an engine without introducing such reformatting. The failure may happen only for some target platforms, because of what formats are supported by kernels for those platforms.
     # flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags = 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
-        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
     datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
     for dt in datatype_set:
         if dt == trt.int8:
diff --git a/qa/common/gen_qa_trt_plugin_models.py b/qa/common/gen_qa_trt_plugin_models.py
index 9fd23d92a8..83ed9e82a7 100755
--- a/qa/common/gen_qa_trt_plugin_models.py
+++ b/qa/common/gen_qa_trt_plugin_models.py
@@ -44,12 +44,7 @@
 def get_trt_plugin(plugin_name):
     plugin = None
     field_collection = None
-    # The upstream onnx_custom_plugin sample is V2 on TRT 10.x release
-    # branches and V3 on rel-11.0 (and TRT 11 removed the V2 plugin
-    # registry surface). Pick the matching API at runtime.
-    registry = trt.get_plugin_registry()
-    use_v3 = not hasattr(registry, "plugin_creator_list")
-    plugin_creators = registry.all_creators if use_v3 else registry.plugin_creator_list
+    plugin_creators = trt.get_plugin_registry().plugin_creator_list
     for plugin_creator in plugin_creators:
         if (plugin_creator.name == "CustomHardmax") and (
             plugin_name == "CustomHardmax"
@@ -62,16 +57,9 @@ def get_trt_plugin(plugin_name):
 
     if field_collection is None:
         raise RuntimeError("Plugin not found: " + plugin_name)
-    if use_v3:
-        plugin = plugin_creator.create_plugin(
-            name=plugin_name,
-            field_collection=field_collection,
-            phase=trt.TensorRTPhase.BUILD,
-        )
-    else:
-        plugin = plugin_creator.create_plugin(
-            name=plugin_name, field_collection=field_collection
-        )
+    plugin = plugin_creator.create_plugin(
+        name=plugin_name, field_collection=field_collection
+    )
 
     return plugin
 
@@ -116,16 +104,9 @@ def create_plan_modelfile(
     input_layer = network.add_input(
         name="INPUT0", dtype=trt_input_dtype, shape=input_with_batchsize
     )
-    # add_plugin_v2 was removed in TRT 11; add_plugin_v3 has existed since
-    # TRT 10.0. Pick the API that exists on this TRT install; the plugin
-    # object returned by get_trt_plugin() is matched to the same version.
-    plugin_obj = get_trt_plugin(plugin_name)
-    if hasattr(network, "add_plugin_v2"):
-        plugin_layer = network.add_plugin_v2(inputs=[input_layer], plugin=plugin_obj)
-    else:
-        plugin_layer = network.add_plugin_v3(
-            inputs=[input_layer], shape_inputs=[], plugin=plugin_obj
-        )
+    plugin_layer = network.add_plugin_v2(
+        inputs=[input_layer], plugin=get_trt_plugin(plugin_name)
+    )
     plugin_layer.get_output(0).name = "OUTPUT0"
     network.mark_output(plugin_layer.get_output(0))
 

From 1bb39785d83d71b4f77aa98c765ffa1b77c4b928 Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Mon, 18 May 2026 16:44:43 -0700
Subject: [PATCH 5/6] Reapply "fix(qa): ommit plugin creation if TensorRT
 branch is missed. (#8783)"

This reverts commit 1275bd227ca9e1e3f66438e8400bafb3fe1640f6.
---
 .../gen_qa_dyna_sequence_implicit_models.py   |  3 +-
 qa/common/gen_qa_dyna_sequence_models.py      |  6 ++-
 qa/common/gen_qa_identity_models.py           |  6 ++-
 qa/common/gen_qa_implicit_models.py           |  3 +-
 qa/common/gen_qa_model_repository             | 54 +++++++------------
 qa/common/gen_qa_models.py                    |  6 ++-
 qa/common/gen_qa_sequence_models.py           |  6 ++-
 qa/common/gen_qa_trt_format_models.py         |  3 +-
 qa/common/gen_qa_trt_plugin_models.py         | 33 +++++++++---
 9 files changed, 67 insertions(+), 53 deletions(-)

diff --git a/qa/common/gen_qa_dyna_sequence_implicit_models.py b/qa/common/gen_qa_dyna_sequence_implicit_models.py
index 32e4ebea13..c69ca28eab 100755
--- a/qa/common/gen_qa_dyna_sequence_implicit_models.py
+++ b/qa/common/gen_qa_dyna_sequence_implicit_models.py
@@ -575,7 +575,8 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_dyna_sequence_models.py b/qa/common/gen_qa_dyna_sequence_models.py
index 7f8459b0da..1a26890f32 100755
--- a/qa/common/gen_qa_dyna_sequence_models.py
+++ b/qa/common/gen_qa_dyna_sequence_models.py
@@ -129,7 +129,8 @@ def create_plan_shape_tensor_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
@@ -369,7 +370,8 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_identity_models.py b/qa/common/gen_qa_identity_models.py
index 248126bcc2..426d939d9e 100755
--- a/qa/common/gen_qa_identity_models.py
+++ b/qa/common/gen_qa_identity_models.py
@@ -584,7 +584,8 @@ def create_plan_dynamic_rf_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
     datatype_set = set([trt_dtype])
     for dt in datatype_set:
         if dt == trt.int8:
@@ -707,7 +708,8 @@ def create_plan_shape_tensor_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
     datatype_set = set([trt_dtype])
     for dt in datatype_set:
         if dt == trt.int8:
diff --git a/qa/common/gen_qa_implicit_models.py b/qa/common/gen_qa_implicit_models.py
index 06b86ee5c2..c0800098ec 100755
--- a/qa/common/gen_qa_implicit_models.py
+++ b/qa/common/gen_qa_implicit_models.py
@@ -1066,7 +1066,8 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index 93c018a15a..455b8b0259 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -355,43 +355,27 @@ python3 $TRITON_MDLS_SRC_DIR/gen_qa_trt_format_models.py --models_dir=$TRITON_MD
 chmod -R 777 $TRITON_MDLS_QA_TRT_FORMAT_MODEL
 nvidia-smi --query-gpu=compute_cap | grep -qzE '10\.7|11\.0' && echo -e '${COLOR_WARNING}[WARNING]${COLOR_RESET} Skipping model generation for data dependent shape (NonZero not supported on this GPU)${COLOR_RESET}' || python3 $TRITON_MDLS_SRC_DIR/gen_qa_trt_data_dependent_shape.py --models_dir=$TRITON_MDLS_QA_TRT_DATA_DEPENDENT_MODEL
 chmod -R 777 $TRITON_MDLS_QA_TRT_DATA_DEPENDENT_MODEL
-# Make shared library for custom Hardmax plugin.
-if [ -d "/usr/src/tensorrt/samples/python/onnx_custom_plugin" ]; then
-    cd /usr/src/tensorrt/samples/python/onnx_custom_plugin
+# Build the custom Hardmax plugin used by L0_trt_plugin. The plugin source is
+# pulled from the public NVIDIA/TensorRT repo at the release/<major.minor>
+# branch matching the runtime TRT version. If the branch is not published
+# yet for this TRT version, the plugin model generation is skipped with a
+# warning -- L0_trt_plugin will report missing artifacts but the rest of
+# the QA model repository is still produced.
+TRT_BRANCH=\$(echo \${TRT_VERSION} | cut -d . -f -2)
+TRTSRC=/workspace/TensorRT
+rm -rf \${TRTSRC}
+if git clone --depth 1 -b release/\${TRT_BRANCH} \
+     https://github.com/NVIDIA/TensorRT.git \${TRTSRC}; then
+  cd \${TRTSRC}/samples/python/onnx_custom_plugin && \
+    rm -rf build && mkdir build && cd build && cmake .. && make -j && \
+    cp libcustomHardmaxPlugin.so ${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}/.
+  LD_PRELOAD=${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}/libcustomHardmaxPlugin.so \
+    python3 ${TRITON_MDLS_SRC_DIR}/gen_qa_trt_plugin_models.py \
+    --models_dir=${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}
+  chmod -R 777 ${TRITON_MDLS_QA_TRT_PLUGIN_MODEL}
 else
-    TRT_BRANCH=\$(echo \$TRT_VERSION | cut -d . -f -2)
-    if ! git clone -b release/\${TRT_BRANCH} --depth 1 https://github.com/NVIDIA/TensorRT.git /workspace/TensorRT; then
-      MAJOR=\$(echo "\$TRT_BRANCH" | cut -d . -f 1)
-      MINOR=\$(echo "\$TRT_BRANCH" | cut -d . -f 2)
-      if [ -n "\$MINOR" ] && [ "\$MINOR" -gt 0 ] 2>/dev/null; then
-        TRT_BRANCH="\${MAJOR}.\$((MINOR - 1))"
-        echo "Fallback: cloning TensorRT release/\${TRT_BRANCH} (previous minor)"
-        git clone -b release/\${TRT_BRANCH} --depth 1 https://github.com/NVIDIA/TensorRT.git /workspace/TensorRT
-      elif [ -n "\$MAJOR" ] && [ "\$MAJOR" -gt 0 ] 2>/dev/null; then
-        PREV_MAJOR=\$((MAJOR - 1))
-        echo "Fallback: MINOR is 0, querying remote for latest release/\${PREV_MAJOR}.x branch"
-        TRT_BRANCH=\$(git ls-remote --heads https://github.com/NVIDIA/TensorRT.git "refs/heads/release/\${PREV_MAJOR}.*" \
-          | awk -F'refs/heads/release/' '{print \$2}' \
-          | awk -F. '{print \$2, \$0}' \
-          | sort -k1,1n \
-          | tail -1 \
-          | awk '{print \$2}')
-        if [ -n "\$TRT_BRANCH" ]; then
-          echo "Fallback: cloning TensorRT release/\${TRT_BRANCH}"
-          git clone -b release/\${TRT_BRANCH} --depth 1 https://github.com/NVIDIA/TensorRT.git /workspace/TensorRT
-        else
-          exit 1
-        fi
-      else
-        exit 1
-      fi
-    fi
-    cd /workspace/TensorRT/samples/python/onnx_custom_plugin
+  echo "[WARNING] TensorRT release/\${TRT_BRANCH} not available on github.com/NVIDIA/TensorRT; skipping CustomHardmax plugin model generation. L0_trt_plugin coverage will be missing for this TRT version."
 fi
-rm -rf build && mkdir build && \
-cd build && cmake .. && make -j && cp libcustomHardmaxPlugin.so $TRITON_MDLS_QA_TRT_PLUGIN_MODEL/.
-LD_PRELOAD=$TRITON_MDLS_QA_TRT_PLUGIN_MODEL/libcustomHardmaxPlugin.so python3 $TRITON_MDLS_SRC_DIR/gen_qa_trt_plugin_models.py --models_dir=$TRITON_MDLS_QA_TRT_PLUGIN_MODEL
-chmod -R 777 $TRITON_MDLS_QA_TRT_PLUGIN_MODEL
 exit 0
 EOF
 
diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py
index fac5001ed9..865f8559dc 100755
--- a/qa/common/gen_qa_models.py
+++ b/qa/common/gen_qa_models.py
@@ -159,7 +159,8 @@ def create_plan_dynamic_rf_modelfile(
     profile.set_shape("INPUT1", min_shape, opt_shape, max_shape)
 
     flags = 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
     for dt in datatype_set:
@@ -449,7 +450,8 @@ def create_plan_fixed_rf_modelfile(
     profile.set_shape("INPUT1", min_shape, opt_shape, max_shape)
 
     flags = 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
     for dt in datatype_set:
diff --git a/qa/common/gen_qa_sequence_models.py b/qa/common/gen_qa_sequence_models.py
index bf83a3a5f3..f8d89a5f9e 100755
--- a/qa/common/gen_qa_sequence_models.py
+++ b/qa/common/gen_qa_sequence_models.py
@@ -118,7 +118,8 @@ def create_plan_shape_tensor_modelfile(
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
@@ -320,7 +321,8 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape)
 
     flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags |= 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
 
     if trt_dtype == trt.int8:
         flags |= 1 << int(trt.BuilderFlag.INT8)
diff --git a/qa/common/gen_qa_trt_format_models.py b/qa/common/gen_qa_trt_format_models.py
index 5645a7178c..5f2cadd69e 100755
--- a/qa/common/gen_qa_trt_format_models.py
+++ b/qa/common/gen_qa_trt_format_models.py
@@ -147,7 +147,8 @@ def create_plan_modelfile(
     # The build will fail if TensorRT cannot build an engine without introducing such reformatting. The failure may happen only for some target platforms, because of what formats are supported by kernels for those platforms.
     # flags = 1 << int(trt.BuilderFlag.DIRECT_IO)
     flags = 1 << int(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)
-    flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
+    if hasattr(trt.BuilderFlag, "REJECT_EMPTY_ALGORITHMS"):
+        flags |= 1 << int(trt.BuilderFlag.REJECT_EMPTY_ALGORITHMS)
     datatype_set = set([trt_input_dtype, trt_output0_dtype, trt_output1_dtype])
     for dt in datatype_set:
         if dt == trt.int8:
diff --git a/qa/common/gen_qa_trt_plugin_models.py b/qa/common/gen_qa_trt_plugin_models.py
index 83ed9e82a7..9fd23d92a8 100755
--- a/qa/common/gen_qa_trt_plugin_models.py
+++ b/qa/common/gen_qa_trt_plugin_models.py
@@ -44,7 +44,12 @@
 def get_trt_plugin(plugin_name):
     plugin = None
     field_collection = None
-    plugin_creators = trt.get_plugin_registry().plugin_creator_list
+    # The upstream onnx_custom_plugin sample is V2 on TRT 10.x release
+    # branches and V3 on rel-11.0 (and TRT 11 removed the V2 plugin
+    # registry surface). Pick the matching API at runtime.
+    registry = trt.get_plugin_registry()
+    use_v3 = not hasattr(registry, "plugin_creator_list")
+    plugin_creators = registry.all_creators if use_v3 else registry.plugin_creator_list
     for plugin_creator in plugin_creators:
         if (plugin_creator.name == "CustomHardmax") and (
             plugin_name == "CustomHardmax"
@@ -57,9 +62,16 @@ def get_trt_plugin(plugin_name):
 
     if field_collection is None:
         raise RuntimeError("Plugin not found: " + plugin_name)
-    plugin = plugin_creator.create_plugin(
-        name=plugin_name, field_collection=field_collection
-    )
+    if use_v3:
+        plugin = plugin_creator.create_plugin(
+            name=plugin_name,
+            field_collection=field_collection,
+            phase=trt.TensorRTPhase.BUILD,
+        )
+    else:
+        plugin = plugin_creator.create_plugin(
+            name=plugin_name, field_collection=field_collection
+        )
 
     return plugin
 
@@ -104,9 +116,16 @@ def create_plan_modelfile(
     input_layer = network.add_input(
         name="INPUT0", dtype=trt_input_dtype, shape=input_with_batchsize
     )
-    plugin_layer = network.add_plugin_v2(
-        inputs=[input_layer], plugin=get_trt_plugin(plugin_name)
-    )
+    # add_plugin_v2 was removed in TRT 11; add_plugin_v3 has existed since
+    # TRT 10.0. Pick the API that exists on this TRT install; the plugin
+    # object returned by get_trt_plugin() is matched to the same version.
+    plugin_obj = get_trt_plugin(plugin_name)
+    if hasattr(network, "add_plugin_v2"):
+        plugin_layer = network.add_plugin_v2(inputs=[input_layer], plugin=plugin_obj)
+    else:
+        plugin_layer = network.add_plugin_v3(
+            inputs=[input_layer], shape_inputs=[], plugin=plugin_obj
+        )
     plugin_layer.get_output(0).name = "OUTPUT0"
     network.mark_output(plugin_layer.get_output(0))
 

From 754528f61186c74183c1bf012059e15fc5c08d7f Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Mon, 18 May 2026 16:44:48 -0700
Subject: [PATCH 6/6] Reapply "test: Add Torch AOTI Tests (#8771)"

This reverts commit 89e6a014dd0d187431ecfcf5ac11a2c3b039ae07.
---
 qa/L0_torch_aoti/test.sh                  | 148 +++++++
 qa/L0_torch_aoti/torch_aoti_infer_test.py | 284 +++++++++++++
 qa/common/gen_qa_model_repository         |   2 +-
 qa/common/gen_qa_models.py                | 492 +++++++++++++++++-----
 4 files changed, 821 insertions(+), 105 deletions(-)
 create mode 100755 qa/L0_torch_aoti/test.sh
 create mode 100755 qa/L0_torch_aoti/torch_aoti_infer_test.py

diff --git a/qa/L0_torch_aoti/test.sh b/qa/L0_torch_aoti/test.sh
new file mode 100755
index 0000000000..f37751c55e
--- /dev/null
+++ b/qa/L0_torch_aoti/test.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
+# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+source ../common/util.sh
+
+if [[ "${DEBUG}" == "true" ]]; then
+    set -x
+else
+    set +x
+fi
+
+COLOR_DARK="\033[90m"
+COLOR_ERROR="\033[31m"
+COLOR_INFO="\033[94m"
+COLOR_RESET="\033[0m"
+COLOR_STATUS="\033[36m"
+COLOR_SUCCESS="\033[32m"
+COLOR_WARNING="\033[33m"
+RET=0
+
+REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
+if [[ "$#" -ge 1 ]]; then
+    REPO_VERSION=$1
+fi
+if [[ -z "$REPO_VERSION" ]]; then
+    echo -e "${COLOR_ERROR}Repository version must be specified${COLOR_RESET}" 1>&2
+    echo -e "${COLOR_ERROR}\n***\n*** Test Failed\n***${COLOR_RESET}" 1>&2
+    exit 1
+fi
+if [[ ! -z "$TEST_REPO_ARCH" ]]; then
+    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
+fi
+
+export CUDA_VISIBLE_DEVICES=0
+
+MODELDIR=${MODELDIR:=`pwd`/models}
+DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"}
+TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
+SERVER=${TRITON_DIR}/bin/tritonserver
+BACKEND_DIR=${TRITON_DIR}/backends
+
+# PyTorch on SBSA requires libgomp to be loaded first. See the following
+# GitHub issue for more information:
+# https://github.com/pytorch/pytorch/issues/2575
+arch=`uname -m`
+echo -e "${COLOR_DARK}Detected architecture: ${arch}${COLOR_RESET}"
+if [[ "${arch}" == "aarch64" ]]; then
+    SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1
+    echo -e "${COLOR_DARK}SERVER_LD_PRELOAD=${SERVER_LD_PRELOAD}${COLOR_RESET}"
+fi
+
+# If BACKENDS is not specified, default to the PyTorch backend only
+BACKENDS=${BACKENDS:="pytorch"}
+export BACKENDS
+
+# Copy the models into the model repository
+echo -e "${COLOR_DARK}Setting up model repository in ${MODELDIR}${COLOR_RESET}"
+rm -rf ${MODELDIR} && mkdir -p ${MODELDIR}
+models=(
+    "torch_aoti_complex_index"
+    "torch_aoti_complex_named"
+    "torch_aoti_int8_int8"
+    "torch_aoti_int16_int16"
+    "torch_aoti_int32_int32"
+    "torch_aoti_int64_int64"
+    "torch_aoti_float16_float16"
+    "torch_aoti_float32_float32"
+    "torchvision_aoti"
+)
+for model in "${models[@]}"; do
+    cp -r ${DATADIR}/qa_model_repository/${model} ${MODELDIR}/${model}
+    echo -e "${COLOR_DARK}ls ${MODELDIR}/${model}${COLOR_RESET}"
+    ls -lha ${MODELDIR}/${model}
+done
+echo -e "${COLOR_DARK}ls ${MODELDIR}${COLOR_RESET}"
+ls -lha ${MODELDIR}
+
+SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1"
+SERVER_LOG="./torch_aoti_complex_named-server.log"
+CLIENT_LOG="./torch_aoti_complex_named-client.log"
+
+echo -e "${COLOR_DARK}Running ${SERVER} with model repository ${MODELDIR}${COLOR_RESET}"
+run_server
+if [[ "${SERVER_PID}" -eq 0 ]]; then
+    echo -e "${COLOR_ERROR}\n***\n*** Failed to start ${SERVER}\n***${COLOR_RESET}" 1>&2  # diagnostics go to stderr
+    cat ${SERVER_LOG} 1>&2
+    echo -e "\n" 1>&2
+    exit 1
+fi
+
+# Install torch framework
+echo -e "${COLOR_DARK}Installing PyTorch framework required by tests${COLOR_RESET}"
+pip install torch
+
+# Run the Tests
+TEST_NAME="torch_aoti_infer_test"
+python3 ./${TEST_NAME}.py >> ${CLIENT_LOG} 2>&1
+EXIT_CODE=$?
+if [[ ${EXIT_CODE} -ne 0 ]]; then
+    echo -e "${COLOR_ERROR}\n***\n*** Test '${TEST_NAME}' Failed with exit code ${EXIT_CODE}\n***${COLOR_RESET}" 1>&2  # diagnostics go to stderr
+    cat ${CLIENT_LOG} 1>&2
+    echo -e "\n" 1>&2
+    RET=1
+else
+    echo -e "${COLOR_INFO}\n***\n*** Test '${TEST_NAME}' Passed\n***${COLOR_RESET}"
+fi
+
+# Cleanup
+echo -e "${COLOR_DARK}Killing server (pid: ${SERVER_PID})${COLOR_RESET}"
+kill -s SIGINT ${SERVER_PID}
+wait ${SERVER_PID} || true
+echo -e "${COLOR_DARK}Removing model repository${COLOR_RESET}"
+for model in "${models[@]}"; do
+    rm -rf ${MODELDIR}/${model}
+done
+
+# Report results and exit.
+if [[ ${RET} -ne 0 ]]; then
+    echo -e "${COLOR_ERROR}\n***\n*** Test Suite FAILED\n***${COLOR_RESET}" 1>&2  # failure summary goes to stderr
+else
+    echo -e "${COLOR_SUCCESS}\n***\n*** Test Suite PASSED\n***${COLOR_RESET}"
+fi
+
+exit ${RET}
diff --git a/qa/L0_torch_aoti/torch_aoti_infer_test.py b/qa/L0_torch_aoti/torch_aoti_infer_test.py
new file mode 100755
index 0000000000..2b93f31a48
--- /dev/null
+++ b/qa/L0_torch_aoti/torch_aoti_infer_test.py
@@ -0,0 +1,284 @@
+#!/usr/bin/python
+# Copyright 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+sys.path.append("../common")
+
+import unittest
+
+import test_util as tu
+import torch
+import tritonclient.http as http
+
+
+class TorchAotiTest(tu.TestResultCollector):
+    def _get_complex_input_shape(self):
+        return (1, 16)
+
+    def _get_complex_output_shape(self):
+        return (1, 16)
+
+    def _get_complex_input_data(self, shape):
+        return [
+            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
+            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
+            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
+            torch.randint(low=0, high=127, size=shape, dtype=torch.int8).numpy(),
+        ]
+
+    def _get_simple_input_data(self, shape, io_type):
+        if io_type in [torch.int8, torch.int16, torch.int32, torch.int64]:
+            return torch.randint(low=0, high=127, size=shape, dtype=io_type).numpy()
+        elif io_type in [torch.float16, torch.float32, torch.float64]:
+            return torch.randn(size=shape, dtype=io_type).numpy()
+        else:
+            raise ValueError(f"Unsupported data type: {io_type}")
+
+    def _get_torchvision_input_data(self, shape):
+        return torch.randn(size=shape, dtype=torch.float32).numpy()
+
+    def _dtype_to_triton_dtype(self, dtype):
+        if dtype == torch.int8:
+            return "INT8"
+        elif dtype == torch.int16:
+            return "INT16"
+        elif dtype == torch.int32:
+            return "INT32"
+        elif dtype == torch.int64:
+            return "INT64"
+        elif dtype == torch.float16:
+            return "FP16"
+        elif dtype == torch.float32:
+            return "FP32"
+        else:
+            raise ValueError(f"Unsupported data type: {dtype}")
+
+    def _get_simple_model_name(self, io_type):
+        if io_type == torch.int8:
+            return "torch_aoti_int8_int8"
+        elif io_type == torch.int16:
+            return "torch_aoti_int16_int16"
+        elif io_type == torch.int32:
+            return "torch_aoti_int32_int32"
+        elif io_type == torch.int64:
+            return "torch_aoti_int64_int64"
+        elif io_type == torch.float16:
+            return "torch_aoti_float16_float16"
+        elif io_type == torch.float32:
+            return "torch_aoti_float32_float32"
+        else:
+            raise ValueError(f"Unsupported data type: {io_type}")
+
+    def test_complex_index(self):
+        MODEL_NAME = "torch_aoti_complex_index"
+        INPUT_SHAPE = self._get_complex_input_shape()
+        OUTPUT_SHAPE = self._get_complex_output_shape()
+
+        input_data = self._get_complex_input_data(INPUT_SHAPE)
+
+        with http.InferenceServerClient("localhost:8000") as client:
+            inputs = [
+                http.InferInput("INPUT__0", input_data[0].shape, "INT8"),
+                http.InferInput("INPUT__1", input_data[1].shape, "INT8"),
+                http.InferInput("INPUT__2", input_data[2].shape, "INT8"),
+                http.InferInput("INPUT__3", input_data[3].shape, "INT8"),
+            ]
+
+            inputs[0].set_data_from_numpy(input_data[0], binary_data=True)
+            inputs[1].set_data_from_numpy(input_data[1], binary_data=True)
+            inputs[2].set_data_from_numpy(input_data[2], binary_data=True)
+            inputs[3].set_data_from_numpy(input_data[3], binary_data=True)
+
+            output_names = [
+                "OUTPUT__0",
+                "OUTPUT__1",
+                "OUTPUT__2",
+                "OUTPUT__3",
+                "OUTPUT__4",
+                "OUTPUT__5",
+            ]
+
+            outputs = []
+            for output_name in output_names:
+                outputs.append(http.InferRequestedOutput(output_name, binary_data=True))
+
+            output_data = []
+            results = client.infer(MODEL_NAME, inputs, outputs=outputs)
+
+            for output_name in output_names:
+                output_data.append(results.as_numpy(output_name))
+
+            self.assertEqual(len(outputs), len(output_data))
+            for data in output_data:
+                self.assertEqual(data.shape, OUTPUT_SHAPE)
+
+            self.assertTrue((output_data[0] == (input_data[0] + input_data[1])).all())
+            self.assertTrue((output_data[1] == input_data[0] - input_data[1]).all())
+            self.assertTrue((output_data[2] == input_data[0]).all())
+            self.assertTrue((output_data[3] == input_data[1]).all())
+            self.assertTrue((output_data[4] == input_data[2]).all())
+            self.assertTrue((output_data[5] == input_data[3]).all())
+
+    def test_complex_named(self):
+        MODEL_NAME = "torch_aoti_complex_named"
+        INPUT_SHAPE = self._get_complex_input_shape()
+        OUTPUT_SHAPE = self._get_complex_output_shape()
+
+        input_data = self._get_complex_input_data(INPUT_SHAPE)
+
+        with http.InferenceServerClient("localhost:8000") as client:
+            inputs = [
+                http.InferInput("ARGS[0]", input_data[0].shape, "INT8"),
+                http.InferInput("ARGS[1]", input_data[1].shape, "INT8"),
+                http.InferInput("ARGS[2][option1]", input_data[2].shape, "INT8"),
+                http.InferInput("ARGS[2][option2]", input_data[3].shape, "INT8"),
+            ]
+
+            inputs[0].set_data_from_numpy(input_data[0], binary_data=True)
+            inputs[1].set_data_from_numpy(input_data[1], binary_data=True)
+            inputs[2].set_data_from_numpy(input_data[2], binary_data=True)
+            inputs[3].set_data_from_numpy(input_data[3], binary_data=True)
+
+            output_names = [
+                "RESULT[AAA]",
+                "RESULT[BBB][0]",
+                "RESULT[BBB][1]",
+                "RESULT[CCC][option1]",
+                "RESULT[CCC][option2]",
+                "RESULT[ZZZ]",
+            ]
+
+            outputs = []
+            for output_name in output_names:
+                outputs.append(http.InferRequestedOutput(output_name, binary_data=True))
+
+            output_data = []
+            results = client.infer(MODEL_NAME, inputs, outputs=outputs)
+
+            for output_name in output_names:
+                output_data.append(results.as_numpy(output_name))
+
+            self.assertEqual(len(outputs), len(output_data))
+            for data in output_data:
+                self.assertEqual(data.shape, OUTPUT_SHAPE)
+
+            self.assertTrue((output_data[0] == (input_data[0] + input_data[1])).all())
+            self.assertTrue((output_data[1] == input_data[0]).all())
+            self.assertTrue((output_data[2] == input_data[1]).all())
+            self.assertTrue((output_data[3] == input_data[2]).all())
+            self.assertTrue((output_data[4] == input_data[3]).all())
+            self.assertTrue((output_data[5] == (input_data[0] - input_data[1])).all())
+
+    def test_simple_model(self):
+        io_types = [
+            torch.int8,
+            torch.int16,
+            torch.int32,
+            torch.int64,
+            torch.float16,
+            torch.float32,
+        ]
+        for io_type in io_types:
+            MODEL_NAME = self._get_simple_model_name(io_type)
+            INPUT_SHAPE = (16,)
+            OUTPUT_SHAPE = (16,)
+            TRITON_IO_TYPE = self._dtype_to_triton_dtype(io_type)
+
+            input_data = (
+                self._get_simple_input_data(INPUT_SHAPE, io_type),
+                self._get_simple_input_data(INPUT_SHAPE, io_type),
+            )
+
+            with http.InferenceServerClient("localhost:8000") as client:
+                inputs = [
+                    http.InferInput("ARGS[0]", input_data[0].shape, TRITON_IO_TYPE),
+                    http.InferInput("ARGS[1]", input_data[1].shape, TRITON_IO_TYPE),
+                ]
+
+                inputs[0].set_data_from_numpy(input_data[0], binary_data=True)
+                inputs[1].set_data_from_numpy(input_data[1], binary_data=True)
+
+                output_names = [
+                    "RESULT",
+                ]
+
+                outputs = []
+                for output_name in output_names:
+                    outputs.append(
+                        http.InferRequestedOutput(output_name, binary_data=True)
+                    )
+
+                output_data = []
+                results = client.infer(MODEL_NAME, inputs, outputs=outputs)
+
+                for output_name in output_names:
+                    output_data.append(results.as_numpy(output_name))
+
+                self.assertEqual(len(outputs), len(output_data))
+                for data in output_data:
+                    self.assertEqual(data.shape, OUTPUT_SHAPE)
+                    self.assertTrue((data == input_data[0] + input_data[1]).all())
+
+    def test_torchvision(self):
+        MODEL_NAME = "torchvision_aoti"
+        INPUT_SHAPE = (1, 3, 224, 224)
+        OUTPUT_SHAPE = (1, 1000)
+
+        input_data = self._get_torchvision_input_data(INPUT_SHAPE)
+        input_data[0][0] = 1.0
+
+        with http.InferenceServerClient("localhost:8000") as client:
+            inputs = [
+                http.InferInput("ARGS[0]", input_data.shape, "FP32"),
+            ]
+
+            inputs[0].set_data_from_numpy(input_data, binary_data=True)
+
+            output_names = [
+                "RESULT",
+            ]
+
+            outputs = []
+            for output_name in output_names:
+                outputs.append(http.InferRequestedOutput(output_name, binary_data=True))
+
+            output_data = []
+            results = client.infer(MODEL_NAME, inputs, outputs=outputs)
+
+            for output_name in output_names:
+                output_data.append(results.as_numpy(output_name))
+
+            self.assertEqual(len(outputs), len(output_data))
+            for data in output_data:
+                self.assertEqual(data.shape, OUTPUT_SHAPE)
+                output_tensor = torch.from_numpy(data)
+                self.assertTrue(torch.isfinite(output_tensor).all().item())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index 455b8b0259..8841060116 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -263,9 +263,9 @@ set -e
 PATH=$PATH:/usr/local/cuda-13.0/bin
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --models_dir=$TRITON_MDLS_QA_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --models_dir=$TRITON_MDLS_QA_MODEL
+python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torchvision-aoti --models_dir=$TRITON_MDLS_QA_MODEL
 chmod -R 777 $TRITON_MDLS_QA_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --libtorch --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL
-python3 $TRITON_MDLS_SRC_DIR/gen_qa_models.py --torch-aoti --variable --models_dir=$TRITON_MDLS_QA_VARIABLE_MODEL
 chmod -R 777 $TRITON_MDLS_QA_VARIABLE_MODEL
 python3 $TRITON_MDLS_SRC_DIR/gen_qa_identity_models.py --libtorch --models_dir=$TRITON_MDLS_QA_IDENTITY_MODEL
 chmod -R 777 $TRITON_MDLS_QA_IDENTITY_MODEL
diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py
index 865f8559dc..d305f07ce3 100755
--- a/qa/common/gen_qa_models.py
+++ b/qa/common/gen_qa_models.py
@@ -47,6 +47,7 @@
 from typing import List, Tuple
 
 _color_blue = "\033[94m"
+_color_cyan = "\033[36m"
 _color_green = "\033[32m"
 _color_magenta = "\033[35m"
 _color_red = "\033[31m"
@@ -1291,7 +1292,7 @@ def forward(self, INPUT0, INPUT1):
     traced.save(f"{model_version_dir}/model.pt")
 
 
-def generate_sample_inputs(
+def generate_torch_aoti_sample_inputs(
     input_shape,
     input_dtype,
     device,
@@ -1299,70 +1300,32 @@ def generate_sample_inputs(
     # handle for -1 (when variable) since can't create tensor with shape of [-1]
     input_shape = [abs(ips) for ips in input_shape]
 
-    if input_dtype == np.int8:
-        input0 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device)
-        input1 = torch.randint(-128, 127, input_shape, dtype=torch.int8, device=device)
-    elif input_dtype == np.int16:
-        input0 = torch.randint(
-            -32768, 32767, input_shape, dtype=torch.int16, device=device
-        )
-        input1 = torch.randint(
-            -32768, 32767, input_shape, dtype=torch.int16, device=device
-        )
-    elif input_dtype == np.int32:
-        input0 = torch.randint(
-            -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device
-        )
-        input1 = torch.randint(
-            -2147483648, 2147483647, input_shape, dtype=torch.int32, device=device
-        )
-    elif input_dtype == np.int64:
-        input0 = torch.randint(
-            -9223372036854775808,
-            9223372036854775807,
-            input_shape,
-            dtype=torch.int64,
-            device=device,
-        )
-        input1 = torch.randint(
-            -9223372036854775808,
-            9223372036854775807,
-            input_shape,
-            dtype=torch.int64,
-            device=device,
-        )
-    elif input_dtype == np.float16:
-        input0 = torch.randn(*input_shape, dtype=torch.float16, device=device)
-        input1 = torch.randn(*input_shape, dtype=torch.float16, device=device)
-    elif input_dtype == np.float32:
-        input0 = torch.randn(*input_shape, dtype=torch.float32, device=device)
-        input1 = torch.randn(*input_shape, dtype=torch.float32, device=device)
-    elif input_dtype == np.float64:
-        input0 = torch.randn(*input_shape, dtype=torch.float64, device=device)
-        input1 = torch.randn(*input_shape, dtype=torch.float64, device=device)
-    elif input_dtype == np.uint8:
-        input0 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device)
-        input1 = torch.randint(0, 255, input_shape, dtype=torch.uint8, device=device)
-    elif input_dtype == np.uint16:
-        input0 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device)
-        input1 = torch.randint(0, 65535, input_shape, dtype=torch.uint16, device=device)
-    elif input_dtype == np.uint32:
-        input0 = torch.randint(
-            0, 4294967295, input_shape, dtype=torch.uint32, device=device
-        )
-        input1 = torch.randint(
-            0, 4294967295, input_shape, dtype=torch.uint32, device=device
-        )
-    elif input_dtype == np.uint64:
-        input0 = torch.randint(
-            0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device
-        )
-        input1 = torch.randint(
-            0, 18446744073709551615, input_shape, dtype=torch.uint64, device=device
+    np_to_torch_dtype = {
+        np.int8: torch.int8,
+        np.int16: torch.int16,
+        np.int32: torch.int32,
+        np.int64: torch.int64,
+        np.float16: torch.float16,
+        np.float32: torch.float32,
+        np.float64: torch.float64,
+        np.uint8: torch.uint8,
+        np.uint16: torch.uint16,
+        np.uint32: torch.uint32,
+        np.uint64: torch.uint64,
+    }
+    input_dtype = np.dtype(input_dtype).type  # unwrap np.dtype -> scalar type key
+    if input_dtype not in np_to_torch_dtype:
+        print(
+            f"{_color_yellow}warning: dtype {input_dtype} is unsupported; falling back to torch.int32{_color_reset}"
         )
-    else:
-        input0 = torch.randn(*input_shape, device=device)
-        input1 = torch.randn(*input_shape, device=device)
+        input_dtype = np.int32
+
+    # These tensors are used as sample inputs for torch.export, so zeros are
+    # sufficient. Resolve the torch dtype once: both inputs share the same
+    # shape, dtype and device.
+    torch_dtype = np_to_torch_dtype[input_dtype]
+    input0 = torch.zeros(input_shape, dtype=torch_dtype, device=device)
+    input1 = torch.zeros(input_shape, dtype=torch_dtype, device=device)
 
     return (input0, input1)
 
@@ -1397,7 +1360,7 @@ def np_to_dtype(np_dtype):
         return torch.int32
 
 
-def create_torch_aoti_modelfile(
+def create_torch_aoti_model_file(
     models_dir,
     model_version,
     input_shape,
@@ -1420,7 +1383,7 @@ def create_torch_aoti_modelfile(
         )
         return False
 
-    model_version_dir = f"{models_dir}/{model_name}/{model_version}"
+    model_version_dir = os.path.join(models_dir, model_name, str(model_version))
 
     print(f"{_color_green}Creating model {model_name}{_color_reset}")
 
@@ -1467,13 +1430,14 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor:
     model.to(device)
     model = model.eval()
 
-    sample_input = generate_sample_inputs(input_shape, input_dtype, device)
+    sample_inputs = generate_torch_aoti_sample_inputs(input_shape, input_dtype, device)
+    package_path = os.path.join(model_version_dir, "model.pt2")
 
     try:
-        ep = torch.export.export(model, sample_input)
+        exported_model = torch.export.export(model, sample_inputs)
         torch._inductor.aoti_compile_and_package(
-            ep,
-            package_path=f"{model_version_dir}/model.pt2",
+            exported_model,
+            package_path=package_path,
         )
     except Exception as e:
         print(
@@ -1486,13 +1450,162 @@ def forward(self, INPUT0: torch.Tensor, INPUT1: torch.Tensor) -> torch.Tensor:
     return True
 
 
-def create_torchvision_aoti_modelfile(
+def create_torch_aoti_complex_model_file(
+    models_dir: str,
+):
+    base_name = "torch_aoti_complex"
+    model_names = [
+        f"{base_name}_named",
+        f"{base_name}_index",
+    ]
+    model_version_dirs = [
+        os.path.join(models_dir, model_names[0], "1"),
+        os.path.join(models_dir, model_names[1], "1"),
+    ]
+
+    for model_version_dir in model_version_dirs:
+        try:
+            os.makedirs(model_version_dir)
+        except OSError:
+            pass  # ignore existing dir
+
+    print(f"{_color_green}Creating model {base_name}{_color_reset}")
+
+    class TorchAotiComplex(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(
+            self,
+            hdata: torch.Tensor,
+            vdata: torch.Tensor,
+            options: dict[str, torch.Tensor],
+        ) -> dict[
+            str,
+            torch.Tensor | tuple[torch.Tensor, torch.Tensor] | dict[str, torch.Tensor],
+        ]:
+            out = {
+                "AAA": hdata + vdata,
+                "ZZZ": hdata - vdata,
+                "BBB": (
+                    hdata,
+                    vdata,
+                ),
+                "CCC": options,
+            }
+
+            return out
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = TorchAotiComplex()
+    model.to(device)
+    model = model.eval()
+
+    SHAPE = (1, 16)
+
+    sample_args = (
+        torch.zeros(SHAPE, dtype=torch.int8, device=device),
+        torch.zeros(SHAPE, dtype=torch.int8, device=device),
+        {
+            "option1": torch.zeros(SHAPE, dtype=torch.int8, device=device),
+            "option2": torch.zeros(SHAPE, dtype=torch.int8, device=device),
+        },
+    )
+
+    # Export and package the model
+    print(f"{_color_green}Exporting and packaging the model...{_color_reset}")
+
+    model_file_name = "model.pt2"
+    package_paths = [
+        os.path.join(model_version_dirs[0], model_file_name),
+        os.path.join(model_version_dirs[1], model_file_name),
+    ]
+
+    try:
+        exported_model = torch.export.export(model, sample_args)
+        torch._inductor.aoti_compile_and_package(
+            exported_model,
+            package_path=package_paths[0],
+        )
+    except Exception as e:
+        print(
+            f"{_color_red}error: Failed to create model {base_name}{_color_reset}",
+            file=sys.stderr,
+        )
+        print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr)
+        return False
+
+    try:
+        # Now load and run the packaged model
+        print(f"{_color_cyan}Loading and running the packaged model...{_color_reset}")
+
+        compiled_model = torch._inductor.aoti_load_package(package_paths[0])
+
+        print(f"{_color_cyan}Compiled model call spec:{_color_reset}")
+
+        for elem in compiled_model.loader.get_call_spec():
+            print(elem)
+
+        print(f"{_color_cyan}Running the compiled model...{_color_reset}")
+
+        with torch.inference_mode():
+            hdata = torch.randint(
+                low=0,
+                high=127,
+                size=SHAPE,
+                dtype=torch.int8,
+                device=device,
+            )
+            vdata = torch.randint(
+                low=0,
+                high=127,
+                size=SHAPE,
+                dtype=torch.int8,
+                device=device,
+            )
+            options = {
+                "option1": torch.randint(
+                    low=0,
+                    high=127,
+                    size=SHAPE,
+                    dtype=torch.int8,
+                    device=device,
+                ),
+                "option2": torch.randint(
+                    low=0,
+                    high=127,
+                    size=SHAPE,
+                    dtype=torch.int8,
+                    device=device,
+                ),
+            }
+
+            _ = compiled_model(hdata, vdata, options)
+
+            print(
+                f'{_color_green}Model "{base_name}" successfully executed.{_color_reset}'
+            )
+    except Exception as e:
+        print(
+            f"{_color_red}error: Failed to validate model {base_name}{_color_reset}",
+            file=sys.stderr,
+        )
+        print(f"\n{_color_red}{e}{_color_reset}\n", file=sys.stderr)
+        return False
+
+    # Copy the compiled model package to the alternate model folder.
+    # Both the named and ordinal addressing versions of the model (from Triton's point-of-view) use the same compiled model.
+    shutil.copy(package_paths[0], package_paths[1])
+
+    return True
+
+
+def create_torchvision_aoti_model_file(
     models_dir: str,
     max_batch: int,
-    model_version: int,
 ):
     model_name = "torchvision_aoti"
-    model_version_dir = f"{models_dir}/{model_name}/{model_version}"
+    model_version_dir = os.path.join(models_dir, model_name, "1")
 
     try:
         os.makedirs(model_version_dir)
@@ -1506,16 +1619,16 @@ def create_torchvision_aoti_modelfile(
     model = model.to(device)
     model = model.eval()
 
+    SHAPE = (max_batch, 3, 224, 224)
+
     # Example input tensor with batch size 1 and 3 color channels (RGB), height and width of 224
-    input_tensor = torch.randn(max_batch, 3, 224, 224, device=device)
+    sample_inputs = (torch.zeros(SHAPE, dtype=torch.float32, device=device),)
 
-    try:
-        ep = torch.export.export(model, (input_tensor,))
+    package_path = os.path.join(model_version_dir, "model.pt2")
 
-        torch._inductor.aoti_compile_and_package(
-            ep,
-            package_path=f"{model_version_dir}/model.pt2",
-        )
+    try:
+        ep = torch.export.export(model, sample_inputs)
+        torch._inductor.aoti_compile_and_package(ep, package_path=package_path)
     except Exception as e:
         print(
             f"{_color_red}error: Failed to create model {model_name}{_color_reset}",
@@ -1611,9 +1724,11 @@ def create_libtorch_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    with open(f"{config_dir}/config.pbtxt", "w") as file:
+    config_path = os.path.join(config_dir, "config.pbtxt")
+
+    with open(config_path, "w") as file:
         file.write(config)
-        print(f"Created {config_dir}/config.pbtxt")
+        print(f"Created {config_path}")
 
     with open(f"{config_dir}/{label_filename}", "w") as file:
         for l in range(output0_label_cnt):
@@ -1621,7 +1736,7 @@ def create_libtorch_modelconfig(
         print(f"Created {config_dir}/{label_filename}")
 
 
-def create_torch_aoti_modelconfig(
+def create_torch_aoti_model_config(
     models_dir,
     input_shape,
     output_shape,
@@ -1652,7 +1767,7 @@ def create_torch_aoti_modelconfig(
     print(f"{_color_green}Creating config for {model_name}{_color_reset}")
 
     label_filename = "output_labels.txt"
-    config_dir = f"{models_dir}/{model_name}"
+    config_dir = os.path.join(models_dir, model_name)
     config = f"""
 backend: "pytorch"
 name: "{model_name}"
@@ -1660,19 +1775,19 @@ def create_torch_aoti_modelconfig(
 version_policy: {version_policy_str}
 input [
   {{
-    name: "INPUT0"
+    name: "ARGS[0]"
     data_type: {np_to_model_dtype(input_dtype)}
     dims: [ {tu.shape_to_dims_str(input_shape)} ]
   }},
   {{
-    name: "INPUT1"
+    name: "ARGS[1]"
     data_type: {np_to_model_dtype(input_dtype)}
     dims: [ {tu.shape_to_dims_str(input_shape)} ]
   }}
 ]
 output [
   {{
-    name: "OUTPUT__0"
+    name: "RESULT"
     data_type: {np_to_model_dtype(output_dtype)}
     dims: [ {tu.shape_to_dims_str(output_shape)} ]
     label_filename: "{label_filename}"
@@ -1686,17 +1801,173 @@ def create_torch_aoti_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    with open(f"{config_dir}/config.pbtxt", "w") as file:
+    config_path = os.path.join(config_dir, "config.pbtxt")
+
+    with open(config_path, "w") as file:
         file.write(config)
-        print(f"Created {config_dir}/config.pbtxt")
+        print(f"Created {config_path}")
 
-    with open(f"{config_dir}/{label_filename}", "w") as file:
+    label_path = os.path.join(config_dir, label_filename)
+
+    with open(label_path, "w") as file:
         for l in range(output_label_cnt):
             file.write(f"label{l}\n")
-        print(f"Created {config_dir}/{label_filename}")
+        print(f"Created {label_path}")
+
+
+def create_torch_aoti_complex_model_config(
+    models_dir,
+):
+    base_name = "torch_aoti_complex"
+    model_names = [
+        f"{base_name}_named",
+        f"{base_name}_index",
+    ]
+
+    print(f"{_color_green}Creating config for {base_name}{_color_reset}")
+
+    config_dirs = [
+        os.path.join(models_dir, model_names[0]),
+        os.path.join(models_dir, model_names[1]),
+    ]
+    configs = [
+        f"""
+backend: "pytorch"
+platform: "torch_aoti"
+name: "{model_names[0]}"
+input: [
+  {{
+    name: "ARGS[0]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "ARGS[1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "ARGS[2][option1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "ARGS[2][option2]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+output: [
+  {{
+    name: "RESULT[AAA]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[BBB][0]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[BBB][1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[CCC][option1]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[CCC][option2]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "RESULT[ZZZ]"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+instance_group [{{ kind: {"KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"} }}]
+""",
+        f"""
+backend: "pytorch"
+name: "{model_names[1]}"
+platform: "torch_aoti"
+input: [
+  {{
+    name: "INPUT__0"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "INPUT__1"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "INPUT__2"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "INPUT__3"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+output: [
+  {{
+    name: "OUTPUT__0"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__1"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__2"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__3"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__4"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }},
+  {{
+    name: "OUTPUT__5"
+    data_type: TYPE_INT8
+    dims: [1, 16]
+  }}
+]
+instance_group [{{ kind: {"KIND_GPU" if torch.cuda.is_available() else "KIND_CPU"} }}]
+""",
+    ]
+
+    # config_dirs and configs are parallel lists (named variant, then index).
+    for config_dir, config in zip(config_dirs, configs):
+        try:
+            os.makedirs(config_dir)
+        except OSError:
+            pass  # ignore existing dir
 
+        config_path = os.path.join(config_dir, "config.pbtxt")
 
-def create_torchvision_aoti_modelconfig(
+        with open(config_path, "w") as file:
+            file.write(config)
+            print(f"Created {config_path}")
+
+
+def create_torchvision_aoti_model_config(
     models_dir: str,
     max_batch: int,
 ):
@@ -1705,7 +1976,7 @@ def create_torchvision_aoti_modelconfig(
 
     print(f"{_color_green}Creating config for {model_name}{_color_reset}")
 
-    config_dir = f"{models_dir}/{model_name}"
+    config_dir = os.path.join(models_dir, model_name)
     config = f"""
 backend: "pytorch"
 name: "{model_name}"
@@ -1713,14 +1984,13 @@ def create_torchvision_aoti_modelconfig(
 max_batch_size: {max_batch}
 input  [
   {{
-    name: "INPUT__0"
+    name: "ARGS[0]"
     data_type: TYPE_FP32
-    format: FORMAT_NCHW
     dims: [ 3, 224, 224 ]
   }}]
 output [
   {{
-    name: "OUTPUT__0"
+    name: "RESULT"
     data_type: TYPE_FP32
     dims: [ 1000 ]
     label_filename: "{label_filename}"
@@ -1734,15 +2004,19 @@ def create_torchvision_aoti_modelconfig(
     except OSError:
         pass  # ignore existing dir
 
-    with open(f"{config_dir}/config.pbtxt", "w") as file:
+    config_path = os.path.join(config_dir, "config.pbtxt")
+
+    with open(config_path, "w") as file:
         file.write(config)
-        print(f"Created {config_dir}/config.pbtxt")
+        print(f"Created {config_path}")
 
     source_path = os.environ.get("TRITON_GENSRCDIR", default="gen_srcdir")
     source_filename = os.path.join(source_path, RESNET50_LABEL_FILE)
 
-    shutil.copyfile(source_filename, f"{config_dir}/{label_filename}")
-    print(f"Created {config_dir}/{label_filename}")
+    target_path = os.path.join(config_dir, label_filename)
+
+    shutil.copyfile(source_filename, target_path)
+    print(f"Created {target_path}")
 
 
 def create_openvino_modelfile(
@@ -2108,14 +2382,14 @@ def create_models(
                 f"{_color_magenta}PyTorch: AOTI model generation requested{_color_reset}"
             )
             # max-batch 8
-            if create_torch_aoti_modelfile(
+            if create_torch_aoti_model_file(
                 models_dir,
                 model_version,
                 input_shape,
                 input_dtype,
                 output0_dtype,
             ):
-                create_torch_aoti_modelconfig(
+                create_torch_aoti_model_config(
                     models_dir,
                     input_shape,
                     output0_shape,
@@ -2354,6 +2628,8 @@ def create_fixed_models(
     if FLAGS.onnx:
         import onnx
     if FLAGS.libtorch or FLAGS.torch_aoti:
+        import shutil
+
         import torch
         from torch import nn
     if FLAGS.torchvision_aoti:
@@ -2758,7 +3034,15 @@ def create_fixed_models(
             for model_shape in [(-1,), (-1, -1), (-1, -1, -1)]:
                 emu.create_nop_modelconfig(FLAGS.models_dir, model_shape, model_dtype)
 
+    if FLAGS.torch_aoti:
+        print(
+            f"{_color_magenta}PyTorch: Complex AOTI model generation requested{_color_reset}"
+        )
+        if create_torch_aoti_complex_model_file(FLAGS.models_dir):
+            create_torch_aoti_complex_model_config(FLAGS.models_dir)
+
     if FLAGS.torchvision_aoti:
+        # TODO: Add support for variable batch size and version policy for torchvision AOTI models.
         print(f"{_color_blue}TorchVision AOTI model generation requested{_color_reset}")
-        if create_torchvision_aoti_modelfile(FLAGS.models_dir, 1, 1):
-            create_torchvision_aoti_modelconfig(FLAGS.models_dir, 1)
+        if create_torchvision_aoti_model_file(FLAGS.models_dir, 1):
+            create_torchvision_aoti_model_config(FLAGS.models_dir, 1)