# Review summary of this patch. These lines precede the first "diff --git" header and
# belong to no hunk.
#
# src/nnfusion/engine/memory_allocator.cpp
#   * emit_memory_alloc(): the emitted cudaSetDevice call moves inside the
#     `if (!FLAGS_ffunction_codegen)` branch, directly ahead of the emitted
#     cudaMalloc / cudaMemset calls.
#   * emit_memory_free(): the emitted cudaSetDevice and cudaFree calls are now wrapped in
#     `if (!FLAGS_ffunction_codegen) { ... }`.
#   * emit_memory_set(): a brace-less `if (!FLAGS_ffunction_codegen)` is added in front of
#     the cudaSetDevice emission only.
#     NOTE(review): the cudaMemset emission that follows is still unconditional, unlike
#     emit_memory_alloc()/emit_memory_free() -- confirm this asymmetry is intended.
#
# src/nnfusion/engine/pass/codegen/{base,cpu,cuda,hlsl_cpp}_codegen_pass.cpp
#   * The emitted get_workspace_size() definition and its emitted extern "C" declarations
#     change return type from int to uint64_t.
#     NOTE(review): the emitted code now names uint64_t; verify that the generated
#     sources/headers pull in <stdint.h> or <cstdint> -- not visible in these hunks.
#   * The CUDA CMake template's CUDA_ARCH default gains
#     `-gencode arch=compute_86,code=sm_86`.
#
# src/python/nnfusion/executor.py
#   * _maybe_reserve_mem(): sets get_workspace_size.restype = ctypes.c_uint64 before the
#     call, matching the new uint64_t return type on the C side.
#
# NOTE(review): in this copy the hunk line breaks and indentation appear to have been
# collapsed into single spaces; regenerate the patch from the repository before trying to
# apply it.
#
diff --git a/src/nnfusion/engine/memory_allocator.cpp b/src/nnfusion/engine/memory_allocator.cpp index fcb99c431..220b47363 100644 --- a/src/nnfusion/engine/memory_allocator.cpp +++ b/src/nnfusion/engine/memory_allocator.cpp @@ -290,9 +290,9 @@ LanguageUnit_p nnfusion::MemoryAllocator::emit_memory_alloc() auto& lu = *_lu; if (m_max_allocated > 0) { - lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n"; if (!FLAGS_ffunction_codegen) { + lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n"; lu << "CUDA_SAFE_CALL(cudaMalloc((void**)&" << this->get_name() << "_memory_pool," << m_max_allocated << "));\n"; lu << "CUDA_SAFE_CALL(cudaMemset((void*)" << this->get_name() << "_memory_pool, 0, " @@ -332,9 +332,13 @@ LanguageUnit_p nnfusion::MemoryAllocator::emit_memory_free() return _lu; auto& lu = *_lu; - lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n"; + if (!FLAGS_ffunction_codegen) + { + lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n"; lu << "CUDA_SAFE_CALL(cudaFree(" << this->get_name() + "_memory_pool));\n"; + } + return _lu; } @@ -342,7 +346,8 @@ LanguageUnit_p nnfusion::MemoryAllocator::emit_memory_set(int value) { LanguageUnit_p _lu(new LanguageUnit(this->get_name() + "_memset")); auto& lu = *_lu; - lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n"; + if (!FLAGS_ffunction_codegen) + lu << "CUDA_SAFE_CALL(cudaSetDevice(" << m_device_id << "));\n"; lu << "CUDA_SAFE_CALL(cudaMemset((void*)" << this->get_name() + "_memory_pool, " << value << ", " << m_max_allocated << "));\n"; return _lu; diff --git a/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp index 2ff245b69..19f37dcb7 100644 --- a/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/base_codegen_pass.cpp @@ -346,7 +346,7 @@ LanguageUnit_p BaseCodegenPass::codegen_workspace_size(std::shared_ptrmax_allocated(); } - *lu_workspace << "int 
get_workspace_size()\n{\n"; + *lu_workspace << "uint64_t get_workspace_size()\n{\n"; *lu_workspace << " return " << total_alloc << ";\n"; *lu_workspace << "}\n"; return lu_workspace; diff --git a/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp index 1f9983567..f9fb5a6dd 100644 --- a/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cpu_codegen_pass.cpp @@ -373,7 +373,7 @@ void CpuCodegenPass::create_header_file(std::shared_ptr ctx, // if (device_type() == CUDA_GPU || device_type() == ROCM_GPU) // lu_header << header::cuda->get_code(); lu_header << "extern \"C\" int get_device_type();\n"; - lu_header << "extern \"C\" int get_workspace_size();\n"; + lu_header << "extern \"C\" uint64_t get_workspace_size();\n"; lu_header << "extern \"C\" int kernel_entry("; std::string params = get_kernel_entry_paras(tu); lu_header << params; diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index 4746f1b70..3edb77cc1 100644 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -1116,7 +1116,7 @@ void CudaCodegenPass::create_header_file(std::shared_ptr ctx lu_header << header::cuda_fp16->get_code(); lu_header << "extern \"C\" int get_device_type();\n"; - lu_header << "extern \"C\" int get_workspace_size();\n"; + lu_header << "extern \"C\" uint64_t get_workspace_size();\n"; lu_header << "extern \"C\" int kernel_entry"; if (FLAGS_fhost_entry) lu_header << "_host"; @@ -1371,7 +1371,7 @@ cmake_minimum_required(VERSION 3.5) SET(SRC "nnfusion_rt.cu" CACHE STRING "codegen source file") SET(TARGET_NAME "nnfusion_naive_rt" CACHE STRING "codegen target name") -SET(CUDA_ARCH "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" CACHE STRING "target 
architecture") +SET(CUDA_ARCH "-gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_86,code=sm_86" CACHE STRING "target architecture") if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) diff --git a/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp index 8025bd147..9d2dcce2b 100644 --- a/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/hlsl_cpp_codegen_pass.cpp @@ -326,7 +326,7 @@ void HLSLCPPCodegenPass::create_header_file(std::shared_ptr auto& lu_header = *lup_header; lu_header << "extern \"C\" RUNTIME_API int get_device_type();\n"; - lu_header << "extern \"C\" RUNTIME_API int get_workspace_size();\n"; + lu_header << "extern \"C\" RUNTIME_API uint64_t get_workspace_size();\n"; lu_header << "extern \"C\" RUNTIME_API int kernel_entry"; if (FLAGS_fhost_entry) lu_header << "_host"; diff --git a/src/python/nnfusion/executor.py b/src/python/nnfusion/executor.py index a53c92ee0..c941e904c 100644 --- a/src/python/nnfusion/executor.py +++ b/src/python/nnfusion/executor.py @@ -234,6 +234,7 @@ def _maybe_reserve_mem(self, device): if get_workspace_size is None: return None + get_workspace_size.restype = ctypes.c_uint64 n_byte = get_workspace_size() if not n_byte: return None