From 4cb30914099dfac019661700283aacb93f6acc3c Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Wed, 16 Apr 2025 05:34:25 -0700
Subject: [PATCH 1/4] Initial commit

---
 src/instance_state.cc | 33 ++++++++++++---------------------
 src/model_state.cc    | 40 ++++++++++++++++++++++++++++++++++++++--
 src/model_state.h     |  9 ++++++++-
 3 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/src/instance_state.cc b/src/instance_state.cc
index 56208a1..3d4b1c9 100644
--- a/src/instance_state.cc
+++ b/src/instance_state.cc
@@ -1,4 +1,4 @@
-// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1697,15 +1697,6 @@ ModelInstanceState::InitOptimizationProfiles()
   // the first context creation. As currently triton supports one
   // context per engine, in order to set the specified profile_index,
   // another context is created and the previous context is destroyed.
-  std::shared_ptr<nvinfer1::IExecutionContext> default_trt_context(
-      engine_->createExecutionContext());
-  if (default_trt_context == nullptr) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL,
-        (std::string("unable to create TensorRT context: ") +
-         model_state_->GetTensorRTLogger().LastErrorMsg())
-            .c_str());
-  }
   std::vector<std::pair<std::string, int>> profile_name_index;
   // No optimization profile is set for this TensorRT plan
   if (ProfileNames().empty()) {
@@ -1736,17 +1727,17 @@ ModelInstanceState::InitOptimizationProfiles()
               .c_str());
       continue;
     }
-    if (profile_index == 0) {
-      res.first->second.context_ = std::move(default_trt_context);
-    } else {
-      res.first->second.context_.reset(engine_->createExecutionContext());
-      if (res.first->second.context_ == nullptr) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL,
-            (std::string("unable to create TensorRT context: ") +
-             model_state_->GetTensorRTLogger().LastErrorMsg())
-                .c_str());
-      }
+
+    res.first->second.context_.reset(engine_->createExecutionContext());
+    if (res.first->second.context_ == nullptr) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to create TensorRT context: ") +
+           model_state_->GetTensorRTLogger().LastErrorMsg())
+              .c_str());
+    }
+
+    if (profile_index != 0) {
       if (!res.first->second.context_->setOptimizationProfileAsync(
               profile_index, stream_)) {
         return TRITONSERVER_ErrorNew(
diff --git a/src/model_state.cc b/src/model_state.cc
index 6127989..254f5e8 100644
--- a/src/model_state.cc
+++ b/src/model_state.cc
@@ -1,4 +1,4 @@
-// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -142,7 +142,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 }
 
 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
-    : TensorRTModel(triton_model), engine_sharing_(true)
+    : TensorRTModel(triton_model), engine_sharing_(true),
+      alloc_strategy_(nvinfer1::ExecutionContextAllocationStrategy::kSTATIC)
 {
   // Obtain backend configuration
   TRITONBACKEND_Backend* backend;
@@ -288,6 +289,41 @@ ModelState::ValidateModelConfig()
 TRITONSERVER_Error*
 ModelState::ParseParameters()
 {
+  triton::common::TritonJson::Value params;
+  bool status = ModelConfig().Find("parameters", &params);
+  if (status) {
+    // If 'allocation_strategy' is not present in 'parameters',
+    // will use the default strategy "STATIC".
+    std::string exec_alloc_strategy;
+    TRITONSERVER_Error* err =
+        GetParameterValue(params, "allocation_strategy", &exec_alloc_strategy);
+    if (err != nullptr) {
+      if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
+        return err;
+      } else {
+        TRITONSERVER_ErrorDelete(err);
+      }
+    } else {
+      // exec_alloc_strategy is present in model config parameters
+      if (exec_alloc_strategy == "STATIC") {
+        alloc_strategy_ = nvinfer1::ExecutionContextAllocationStrategy::kSTATIC;
+      } else if (exec_alloc_strategy == "ON_PROFILE_CHANGE") {
+        alloc_strategy_ =
+            nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE;
+      } else {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            ("Invalid value for 'allocation_strategy': '" +
+             exec_alloc_strategy + "' for model instance '" + Name() + "'")
+                .c_str());
+      }
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_INFO,
+          ("'allocation_strategy' set to '" + exec_alloc_strategy +
+           "' for model instance '" + Name() + "'")
+              .c_str());
+    }
+  }
   return nullptr;  // success
 }
 
diff --git a/src/model_state.h b/src/model_state.h
index b132806..42274a3 100644
--- a/src/model_state.h
+++ b/src/model_state.h
@@ -1,4 +1,4 @@
-// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -88,6 +88,11 @@ class ModelState : public TensorRTModel {
 
   TensorRTLogger& GetTensorRTLogger() { return tensorrt_logger_; }
 
+  nvinfer1::ExecutionContextAllocationStrategy AllocationStrategy() const
+  {
+    return alloc_strategy_;
+  }
+
  private:
   ModelState(TRITONBACKEND_Model* triton_model);
 
@@ -140,6 +145,8 @@ class ModelState : public TensorRTModel {
 
   // Whether the backend should support version-compatible TensorRT models.
   static inline bool is_version_compatible_{false};
+
+  nvinfer1::ExecutionContextAllocationStrategy alloc_strategy_;
 };
 
 

From 351d027def3b62e7d6ba76abb040d826115badcb Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Wed, 16 Apr 2025 14:52:34 -0700
Subject: [PATCH 2/4] Minor fix

---
 src/instance_state.cc | 3 ++-
 src/model_state.cc    | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/instance_state.cc b/src/instance_state.cc
index 3d4b1c9..b999b70 100644
--- a/src/instance_state.cc
+++ b/src/instance_state.cc
@@ -1728,7 +1728,8 @@ ModelInstanceState::InitOptimizationProfiles()
       continue;
     }
 
-    res.first->second.context_.reset(engine_->createExecutionContext());
+    res.first->second.context_.reset(
+        engine_->createExecutionContext(model_state_->AllocationStrategy()));
     if (res.first->second.context_ == nullptr) {
       return TRITONSERVER_ErrorNew(
           TRITONSERVER_ERROR_INTERNAL,
diff --git a/src/model_state.cc b/src/model_state.cc
index 254f5e8..5a25af3 100644
--- a/src/model_state.cc
+++ b/src/model_state.cc
@@ -314,7 +314,8 @@ ModelState::ParseParameters()
         return TRITONSERVER_ErrorNew(
             TRITONSERVER_ERROR_INVALID_ARG,
             ("Invalid value for 'allocation_strategy': '" +
-             exec_alloc_strategy + "' for model instance '" + Name() + "'")
+             exec_alloc_strategy + "' for model instance '" + Name() +
+             "'. Supported values are 'STATIC' and 'ON_PROFILE_CHANGE'.")
                 .c_str());
       }
       LOG_MESSAGE(

From 56e09e98baea796112d4099ee76c408e15ba6b13 Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Thu, 17 Apr 2025 11:33:50 -0700
Subject: [PATCH 3/4] Minor update

---
 src/model_state.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/model_state.cc b/src/model_state.cc
index 5a25af3..d3395aa 100644
--- a/src/model_state.cc
+++ b/src/model_state.cc
@@ -294,9 +294,9 @@ ModelState::ParseParameters()
   if (status) {
     // If 'allocation_strategy' is not present in 'parameters',
     // will use the default strategy "STATIC".
-    std::string exec_alloc_strategy;
+    std::string alloc_strategy;
     TRITONSERVER_Error* err =
-        GetParameterValue(params, "allocation_strategy", &exec_alloc_strategy);
+        GetParameterValue(params, "allocation_strategy", &alloc_strategy);
     if (err != nullptr) {
       if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
         return err;
@@ -304,23 +304,23 @@ ModelState::ParseParameters()
         TRITONSERVER_ErrorDelete(err);
       }
     } else {
-      // exec_alloc_strategy is present in model config parameters
-      if (exec_alloc_strategy == "STATIC") {
+      // allocation_strategy is present in model config parameters
+      if (alloc_strategy == "STATIC") {
         alloc_strategy_ = nvinfer1::ExecutionContextAllocationStrategy::kSTATIC;
-      } else if (exec_alloc_strategy == "ON_PROFILE_CHANGE") {
+      } else if (alloc_strategy == "ON_PROFILE_CHANGE") {
         alloc_strategy_ =
             nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE;
       } else {
         return TRITONSERVER_ErrorNew(
             TRITONSERVER_ERROR_INVALID_ARG,
-            ("Invalid value for 'allocation_strategy': '" +
-             exec_alloc_strategy + "' for model instance '" + Name() +
+            ("Invalid value for 'allocation_strategy': '" + alloc_strategy +
+             "' for model instance '" + Name() +
              "'. Supported values are 'STATIC' and 'ON_PROFILE_CHANGE'.")
                 .c_str());
       }
       LOG_MESSAGE(
           TRITONSERVER_LOG_INFO,
-          ("'allocation_strategy' set to '" + exec_alloc_strategy +
+          ("'allocation_strategy' set to '" + alloc_strategy +
            "' for model instance '" + Name() + "'")
               .c_str());
     }

From 65d9d8bb295c0a85640ae0c7b71001b846a06d7e Mon Sep 17 00:00:00 2001
From: Yingge He <yinggeh@nvidia.com>
Date: Fri, 18 Apr 2025 01:49:38 -0700
Subject: [PATCH 4/4] Sync forked PR

---
 README.md             | 18 ++++++++++++++++++
 src/instance_state.cc |  5 +----
 src/model_state.cc    | 17 +++++++++--------
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 4fcd7b6..33c9fa7 100644
--- a/README.md
+++ b/README.md
@@ -99,3 +99,21 @@ but the listed CMake argument can be used to override.
 * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag]
 * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
 * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
+
+## Parameters
+
+Triton exposes some flags to control the execution mode of the TensorRT models through
+the Parameters section of the model's `config.pbtxt` file.
+
+### execution_context_allocation_strategy
+
+Different memory allocation behaviors for IExecutionContext. IExecutionContext requires a block of device memory for internal activation tensors during inference. The user can let the execution context manage the memory in various ways. Current options are "STATIC" (default) and "ON_PROFILE_CHANGE".
+
+```
+parameters: {
+  key: "execution_context_allocation_strategy"
+  value: {
+    string_value: "STATIC"
+  }
+}
+```
diff --git a/src/instance_state.cc b/src/instance_state.cc
index b999b70..8819911 100644
--- a/src/instance_state.cc
+++ b/src/instance_state.cc
@@ -1693,10 +1693,6 @@ ModelInstanceState::InitIOIndexMap()
 TRITONSERVER_Error*
 ModelInstanceState::InitOptimizationProfiles()
 {
-  // TRT sets the optimization profile index to be 0 implicitly with
-  // the first context creation. As currently triton supports one
-  // context per engine, in order to set the specified profile_index,
-  // another context is created and the previous context is destroyed.
   std::vector<std::pair<std::string, int>> profile_name_index;
   // No optimization profile is set for this TensorRT plan
   if (ProfileNames().empty()) {
@@ -1728,6 +1724,7 @@ ModelInstanceState::InitOptimizationProfiles()
       continue;
     }
 
+    // Create a new execution context for the profile
     res.first->second.context_.reset(
         engine_->createExecutionContext(model_state_->AllocationStrategy()));
     if (res.first->second.context_ == nullptr) {
diff --git a/src/model_state.cc b/src/model_state.cc
index d3395aa..bc72d67 100644
--- a/src/model_state.cc
+++ b/src/model_state.cc
@@ -292,11 +292,11 @@ ModelState::ParseParameters()
   triton::common::TritonJson::Value params;
   bool status = ModelConfig().Find("parameters", &params);
   if (status) {
-    // If 'allocation_strategy' is not present in 'parameters',
-    // will use the default strategy "STATIC".
+    // If 'execution_context_allocation_strategy' is not present in
+    // 'parameters', will use the default strategy "STATIC".
     std::string alloc_strategy;
-    TRITONSERVER_Error* err =
-        GetParameterValue(params, "allocation_strategy", &alloc_strategy);
+    TRITONSERVER_Error* err = GetParameterValue(
+        params, "execution_context_allocation_strategy", &alloc_strategy);
     if (err != nullptr) {
       if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
         return err;
@@ -304,7 +304,8 @@ ModelState::ParseParameters()
         TRITONSERVER_ErrorDelete(err);
       }
     } else {
-      // allocation_strategy is present in model config parameters
+      // 'execution_context_allocation_strategy' is present in model config
+      // parameters.
       if (alloc_strategy == "STATIC") {
         alloc_strategy_ = nvinfer1::ExecutionContextAllocationStrategy::kSTATIC;
       } else if (alloc_strategy == "ON_PROFILE_CHANGE") {
@@ -313,14 +314,14 @@ ModelState::ParseParameters()
       } else {
         return TRITONSERVER_ErrorNew(
             TRITONSERVER_ERROR_INVALID_ARG,
-            ("Invalid value for 'allocation_strategy': '" + alloc_strategy +
-             "' for model instance '" + Name() +
+            ("Invalid value for 'execution_context_allocation_strategy': '" +
+             alloc_strategy + "' for model instance '" + Name() +
              "'. Supported values are 'STATIC' and 'ON_PROFILE_CHANGE'.")
                 .c_str());
       }
       LOG_MESSAGE(
           TRITONSERVER_LOG_INFO,
-          ("'allocation_strategy' set to '" + alloc_strategy +
+          ("'execution_context_allocation_strategy' set to '" + alloc_strategy +
            "' for model instance '" + Name() + "'")
               .c_str());
     }