diff --git a/src/nnfusion/engine/memory_allocator.cpp b/src/nnfusion/engine/memory_allocator.cpp
index fcb99c431..220b47363 100644
--- a/src/nnfusion/engine/memory_allocator.cpp
+++ b/src/nnfusion/engine/memory_allocator.cpp
@@ -290,9 +290,9 @@ LanguageUnit_p nnfusion::MemoryAllocator::emit_memory_alloc()
     auto& lu = *_lu;
     if (m_max_allocated > 0)
     {
-        lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n";
         if (!FLAGS_ffunction_codegen)
         {
+            lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n";
             lu << "CUDA_SAFE_CALL(cudaMalloc((void**)&" << this->get_name() << "_memory_pool,"
                << m_max_allocated << "));\n";
             lu << "CUDA_SAFE_CALL(cudaMemset((void*)" << this->get_name() << "_memory_pool, 0, "
@@ -332,9 +332,13 @@ LanguageUnit_p nnfusion::MemoryAllocator::emit_memory_free()
         return _lu;
 
     auto& lu = *_lu;
-    lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n";
+
     if (!FLAGS_ffunction_codegen)
+    {
+        lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n";
         lu << "CUDA_SAFE_CALL(cudaFree(" << this->get_name() + "_memory_pool));\n";
+    }
+
     return _lu;
 }
 
@@ -342,7 +346,8 @@ LanguageUnit_p nnfusion::MemoryAllocator::emit_memory_set(int value)
 {
     LanguageUnit_p _lu(new LanguageUnit(this->get_name() + "_memset"));
     auto& lu = *_lu;
-    lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n";
+    if (!FLAGS_ffunction_codegen)
+        lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n";
     lu << "CUDA_SAFE_CALL(cudaMemset((void*)" << this->get_name() + "_memory_pool, " << value
        << ", " << m_max_allocated << "));\n";
     return _lu;
diff --git a/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp
index 2ff245b69..19f37dcb7 100644
--- a/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp
@@ -346,7 +346,7 @@ LanguageUnit_p BaseCodegenPass::codegen_workspace_size(std::shared_ptr<Translati
         total_alloc += allocator.second->max_allocated();
     }
 
-    *lu_workspace << "int get_workspace_size()\n{\n";
+    *lu_workspace << "uint64_t get_workspace_size()\n{\n";
     *lu_workspace << "    return " << total_alloc << ";\n";
     *lu_workspace << "}\n";
     return lu_workspace;
diff --git a/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp
index 1f9983567..f9fb5a6dd 100644
--- a/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp
@@ -373,7 +373,7 @@ void CpuCodegenPass::create_header_file(std::shared_ptr<InterpreterContext> ctx,
     // if (device_type() == CUDA_GPU || device_type() == ROCM_GPU)
     //     lu_header << header::cuda->get_code();
     lu_header << "extern \"C\" int get_device_type();\n";
-    lu_header << "extern \"C\" int get_workspace_size();\n";
+    lu_header << "extern \"C\" uint64_t get_workspace_size();\n";
     lu_header << "extern \"C\" int kernel_entry(";
     std::string params = get_kernel_entry_paras(tu);
     lu_header << params;
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 4746f1b70..3edb77cc1 100644
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -1116,7 +1116,7 @@ void CudaCodegenPass::create_header_file(std::shared_ptr<InterpreterContext> ctx
 
         lu_header << header::cuda_fp16->get_code();
     lu_header << "extern \"C\" int get_device_type();\n";
-    lu_header << "extern \"C\" int get_workspace_size();\n";
+    lu_header << "extern \"C\" uint64_t get_workspace_size();\n";
     lu_header << "extern \"C\" int kernel_entry";
     if (FLAGS_fhost_entry)
         lu_header << "_host";
@@ -1371,7 +1371,7 @@ cmake_minimum_required(VERSION 3.5)
 
 SET(SRC "nnfusion_rt.cu" CACHE STRING "codegen source file")
 SET(TARGET_NAME "nnfusion_naive_rt" CACHE STRING "codegen target name")
-SET(CUDA_ARCH "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" CACHE STRING "target architecture")
+SET(CUDA_ARCH "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_86,code=sm_86" CACHE STRING "target architecture")
 
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
diff --git a/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp
index 8025bd147..9d2dcce2b 100644
--- a/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp
@@ -326,7 +326,7 @@ void HLSLCPPCodegenPass::create_header_file(std::shared_ptr<InterpreterContext>
     auto& lu_header = *lup_header;
 
     lu_header << "extern \"C\" RUNTIME_API int get_device_type();\n";
-    lu_header << "extern \"C\" RUNTIME_API int get_workspace_size();\n";
+    lu_header << "extern \"C\" RUNTIME_API uint64_t get_workspace_size();\n";
     lu_header << "extern \"C\" RUNTIME_API int kernel_entry";
     if (FLAGS_fhost_entry)
         lu_header << "_host";
diff --git a/src/python/nnfusion/executor.py b/src/python/nnfusion/executor.py
index a53c92ee0..c941e904c 100644
--- a/src/python/nnfusion/executor.py
+++ b/src/python/nnfusion/executor.py
@@ -234,6 +234,7 @@ def _maybe_reserve_mem(self, device):
         if get_workspace_size is None:
             return None
 
+        get_workspace_size.restype = ctypes.c_uint64
         n_byte = get_workspace_size()
         if not n_byte:
             return None