diff --git a/.github/workflows/compiler-build.yml b/.github/workflows/compiler-build.yml
index 32ab43f05..04a0a08ea 100644
--- a/.github/workflows/compiler-build.yml
+++ b/.github/workflows/compiler-build.yml
@@ -138,7 +138,7 @@ jobs:
         working-directory: ${{github.workspace}}
         run: |
           dotnet tool install --global dotnet-coverage
-          dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/unit.xml "dotnet test -c ${{matrix.config.buildType}} -s test.runsettings --no-build --verbosity normal --blame"
+          dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/unit.xml "dotnet test -c ${{matrix.config.buildType}} -s test.runsettings --no-build --verbosity normal --filter FullyQualifiedName!~Nncase.Tests.TargetTest.UnitTestCUDAKernels --blame"
           dotnet-coverage merge -o coverage.unit.xml -f cobertura -r coverage/*.xml
 
       - name: Upload Coverage
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0193cbf70..01d61edc0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,18 @@ option(BUILD_TESTING "Build test programs" OFF)
 option(ENABLE_OP_PROFILE "Profile ops cast time" OFF)
 option(ENABLE_DUMP_MANAGER "Enable dump manager" OFF)
 option(ENABLE_DUMP_MEM "Dump mem usage" OFF)
+option(ENABLE_CUDA_RUNTIME "Enable CUDA runtime" OFF)
+
+if(DEFINED CMAKE_CUDA_COMPILER AND NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "")
+    set(ENABLE_CUDA_RUNTIME ON CACHE BOOL "Enable CUDA runtime" FORCE)
+endif()
+
+if(ENABLE_CUDA_RUNTIME)
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES 120)
+    endif()
+    enable_language(CUDA)
+endif()
 
 if (BUILDING_RUNTIME)
     # option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" OFF)
diff --git a/cmake/compile_flags.cmake b/cmake/compile_flags.cmake
index b5d7a36c4..fa9022513 100644
--- a/cmake/compile_flags.cmake
+++ b/cmake/compile_flags.cmake
@@ -4,7 +4,7 @@ if (MSVC)
     set(PYBIND11_CPP_STANDARD "/std:c++latest")
 else()
     add_compile_options(-fvisibility=hidden)
-    add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
+    add_compile_options(-Wall -Wextra -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
     if (APPLE)
         add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized -Wno-deprecated -Wno-braced-scalar-init)
     elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -15,6 +15,11 @@ else()
     endif()
 endif()
 
+if (CMAKE_CUDA_COMPILER)
+    message(STATUS "Configuring for CUDA")
+    #add_compile_options(-save-temps)
+endif()
+
 if(${CMAKE_SYSTEM_PROCESSOR} MATCHES
    "(x86)|(X86)|(amd64)|(AMD64)|(x86_64)|(X86_64)")
     if (MSVC)
diff --git a/conanfile.py b/conanfile.py
index b20364120..33d5c946b 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -29,6 +29,7 @@ class nncaseConan(ConanFile):
         "k230_runtime": [True, False],
         "k80_runtime": [True, False],
         "vulkan_runtime": [True, False],
+        "cuda_runtime": [True, False],
         "tests": [True, False],
         "python": [True, False],
         "python_root": ["ANY"]
@@ -40,6 +41,7 @@ class nncaseConan(ConanFile):
         "k230_runtime": False,
         "k80_runtime": False,
         "vulkan_runtime": False,
+        "cuda_runtime": False,
         "tests": False,
         "python": True,
         "python_root": ""
@@ -88,8 +90,11 @@ def generate(self):
         tc.variables['ENABLE_K230_RUNTIME'] = self.options.k230_runtime
         tc.variables['ENABLE_K80_RUNTIME'] = self.options.k80_runtime
         tc.variables['ENABLE_VULKAN_RUNTIME'] = self.options.vulkan_runtime
+        tc.variables['ENABLE_CUDA_RUNTIME'] = self.options.cuda_runtime
         tc.variables['BUILD_PYTHON_BINDING'] = self.options.python
         tc.variables['BUILD_TESTING'] = self.options.tests
+        if self.options.cuda_runtime:
+            tc.variables['CMAKE_CUDA_ARCHITECTURES'] = "120"
         if self.options.get_safe("python_root", default="") != "":
             tc.variables['Python3_ROOT_DIR'] = self.options.python_root
         if self.options.runtime:
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceBuiltn.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceBuiltn.cs
index a746e14ad..c9cd8d750 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceBuiltn.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceBuiltn.cs
@@ -80,16 +80,16 @@ public static string TopoAwareRuntimeDef(NTTTargetOptions options, ulong dataAli
         return content;
     }
 
-    public static string ModuleTopologyDef(NTTTargetOptions options)
+    public static string ModuleTopologyDef(NTTTargetOptions options, bool isCUDA)
     {
-        var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/module_topology_def.h.cshtml", options).Result;
+        var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/module_topology_def.h.cshtml", new { Hierarchies = options.Hierarchies[0], IsCUDA = isCUDA }).Result;
         return content;
     }
 
-    public static string CMakeDef()
+    public static string CMakeDef(bool isCUDA)
     {
         var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "ntt_module.cmake"));
-        var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath }).Result;
+        var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath, IsCUDA = isCUDA }).Result;
         return content;
     }
 
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceCompiler.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceCompiler.cs
index 5d38ea682..34e15ba45 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceCompiler.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceCompiler.cs
@@ -22,6 +22,8 @@ public class CSourceCompiler
 {
     private static string? _vcVarPath;
 
+    private readonly bool _isCUDA;
+
     /// <summary>
     /// compiler exe name.
     /// </summary>
@@ -37,8 +39,9 @@ public class CSourceCompiler
     /// </summary>
     private string _ext = string.Empty;
 
-    public CSourceCompiler()
+    public CSourceCompiler(bool isCUDA)
     {
+        _isCUDA = isCUDA;
         PlatformSpecific();
         ArchSpecific();
     }
@@ -186,8 +189,16 @@ private void ArchSpecific()
 
     private string ArgumentsSpecific(string sourcePath, string outPath)
     {
-        var archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
-        "-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;
+        string archConfig = string.Empty;
+        if (_isCUDA)
+        {
+            archConfig = $"-DCMAKE_CUDA_ARCHITECTURES=120 -DCMAKE_CUDA_COMPILER=clang++";
+        }
+        else
+        {
+            archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
+            "-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;
+        }
 
 #if DEBUG
         var config = "Release";
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/DeviceCSourceConvertVisitor.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/DeviceCSourceConvertVisitor.cs
index 26a66d3a8..b4b47410d 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/DeviceCSourceConvertVisitor.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/DeviceCSourceConvertVisitor.cs
@@ -47,7 +47,7 @@ public static void WriteWithProfiler(string functionName, string tagName = "")
         IndentScope.Writer.IndWrite("{\n");
 #if false // Disable device profiling for now.
         IndentScope.Writer.Write($"constexpr std::string_view function_name = \"{tagName}\";\n");
-        IndentScope.Writer.Write($"auto_profiler profiler(function_name, runtime::profiling_level::device);\n");
+        IndentScope.Writer.Write($"profile_scope profiler(function_name, profile_level::device);\n");
 #endif
         IndentScope.Writer.Write($"{functionName};\n");
         IndentScope.Writer.IndWrite("}\n");
@@ -69,7 +69,7 @@ public static void WriteIndWithProfiler(string functionName, string tagName = ""
         IndentScope.Writer.IndWrite("{\n");
 #if false // Disable device profiling for now.
         IndentScope.Writer.IndWrite($"constexpr std::string_view function_name = \"{tagName}\";\n");
-        IndentScope.Writer.IndWrite($"auto_profiler profiler(function_name, runtime::profiling_level::device);\n");
+        IndentScope.Writer.IndWrite($"profile_scope profiler(function_name, profile_level::device);\n");
 #endif
         IndentScope.Writer.IndWrite($"{functionName};\n");
         IndentScope.Writer.IndWrite("}\n");
@@ -94,7 +94,7 @@ protected override CSymbol VisitPrimFunction(PrimFunction expr)
         }
 
         var ctype = $"template<{string.Join(", ", Enumerable.Range(0, expr.Parameters.Length).Select(x => $"class T{x}"))}>" + Environment.NewLine +
-            $"void {expr.Name}({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"T{i} &&{s.Name}").ToArray())})";
+            $"NTT_DEVICE void {expr.Name}({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"T{i} &&{s.Name}").ToArray())})";
 
         using (var scope = new IndentScope(_deviceBuilder))
         {
@@ -192,7 +192,7 @@ protected override CSymbol VisitPhysicalBuffer(PhysicalBuffer expr)
             _ => throw new NotSupportedException(expr.Location.ToString()),
         };
 
-        var str = $"std::span<std::byte, {size.Name}>({name} + {start.Name}, {size.Name})";
+        var str = $"ntt::span<std::byte, {size.Name}>({name} + {start.Name}, {size.Name})";
         symbol = new(start.Type, str);
         _exprMemo.Add(expr, symbol);
         return symbol;
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/FunctionBuilder.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/FunctionBuilder.cs
index b44ee466d..8d64872f2 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/FunctionBuilder.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/FunctionBuilder.cs
@@ -21,15 +21,17 @@ internal class FunctionBuilder
     private readonly BinaryWriter _textWriter;
     private readonly BinaryWriter _rdataWriter;
     private readonly IReadOnlyList<BinaryWriter> _threadLocalRdataWriters;
+    private readonly IReadOnlyList<BinaryWriter> _warpLocalRdataWriters;
     private readonly IReadOnlyList<BinaryWriter> _blockLocalRdataWriters;
 
-    public FunctionBuilder(uint id, BinaryWriter rdataWriter, IReadOnlyList<BinaryWriter> threadLocalRdataWriters, IReadOnlyList<BinaryWriter> blockLocalRdataWriters, Targets.NTTTargetOptions targetOptions)
+    public FunctionBuilder(uint id, BinaryWriter rdataWriter, IReadOnlyList<BinaryWriter> threadLocalRdataWriters, IReadOnlyList<BinaryWriter> warpLocalRdataWriters, IReadOnlyList<BinaryWriter> blockLocalRdataWriters, Targets.NTTTargetOptions targetOptions)
     {
         _id = id;
         _sectionManager = new();
         _textWriter = _sectionManager.GetWriter(WellknownSectionNames.Text);
         _rdataWriter = rdataWriter;
         _threadLocalRdataWriters = threadLocalRdataWriters;
+        _warpLocalRdataWriters = warpLocalRdataWriters;
         _blockLocalRdataWriters = blockLocalRdataWriters;
         TargetOptions = targetOptions;
     }
@@ -58,66 +60,17 @@ public unsafe ILinkableFunction Build(BaseFunction baseFunc)
                     tensor.Serialize(_rdataWriter.BaseStream);
                 }
 
-                // 2. write the thread local rdata
-                ulong threadLocalRdataPoolSize = ulong.MinValue;
-                foreach (var (@const, range) in primFunc.SchedResult.ThreadLocalRdatas)
-                {
-                    var tensor = ((TensorConst)@const).Value;
-                    var distributedType = (DistributedType)@const.CheckedType;
-                    var size = range.Max - range.Min;
-                    threadLocalRdataPoolSize = System.Math.Max(range.Max, threadLocalRdataPoolSize);
-                    var dividedDims = DistributedUtility.GetDividedTensorType(distributedType).Shape.ToValueArray();
-                    var localStrides = TensorUtilities.GetDefaultStrides(dividedDims);
-                    for (int i = 0; i < _threadLocalRdataWriters.Count; i++)
-                    {
-                        var threadLocalRdataWriter = _threadLocalRdataWriters[i];
-                        var shardIndex = DistributedUtility.GetUnraveledIndex(i, TargetOptions.Hierarchies[0]);
-                        (var localOffset, var localShape) = DistributedUtility.GetLocalOffsetAndShape(distributedType, shardIndex);
-                        var linearOffset = TensorUtilities.GetLinearOffset(tensor.Strides, localOffset);
-
-                        if ((ulong)TensorUtilities.GetProduct(localShape) * (ulong)tensor.ElementType.SizeInBytes > size)
-                        {
-                            throw new InvalidDataException("The Buffer Size Not Equal!");
-                        }
-
-                        threadLocalRdataWriter.Position(checked((long)range.Min));
-                        tensor.Serialize(threadLocalRdataWriter.BaseStream, linearOffset, localShape, localStrides);
-                    }
-                }
-
-                // 2. write the block local rdata
-                ulong blockLocalRdataPoolSize = ulong.MinValue;
-                foreach (var (@const, range) in primFunc.SchedResult.BlockLocalRdatas)
-                {
-                    var tensor = ((TensorConst)@const).Value;
-                    var distributedType = (DistributedType)@const.CheckedType;
-                    var size = range.Max - range.Min;
-                    blockLocalRdataPoolSize = System.Math.Max(range.Max, blockLocalRdataPoolSize);
-                    var dividedDims = DistributedUtility.GetDividedTensorType(distributedType).Shape.ToValueArray();
-                    var localStrides = TensorUtilities.GetDefaultStrides(dividedDims);
-                    for (int i = 0; i < _blockLocalRdataWriters.Count; i++)
-                    {
-                        var blockLocalRdataWriter = _blockLocalRdataWriters[i];
-                        var shardIndex = DistributedUtility.GetUnraveledIndex(i, TargetOptions.Hierarchies[0][..^1]).Concat([0]).ToArray();
-                        (var localOffset, var localShape) = DistributedUtility.GetLocalOffsetAndShape(distributedType, shardIndex);
-                        var linearOffset = TensorUtilities.GetLinearOffset(tensor.Strides, localOffset);
-
-                        if ((ulong)TensorUtilities.GetProduct(localShape) * (ulong)tensor.ElementType.SizeInBytes > size)
-                        {
-                            throw new InvalidDataException("The Buffer Size Not Equal!");
-                        }
-
-                        blockLocalRdataWriter.Position(checked((long)range.Min));
-                        tensor.Serialize(blockLocalRdataWriter.BaseStream, linearOffset, localShape, localStrides);
-                    }
-                }
+                // 2. write the local rdatas
+                var threadLocalRdataPoolSize = SerializeLocalRdata(primFunc.SchedResult.ThreadLocalRdatas, _threadLocalRdataWriters, "t");
+                var warpLocalRdataPoolSize = SerializeLocalRdata(primFunc.SchedResult.WarpLocalRdatas, _warpLocalRdataWriters, "w");
+                var blockLocalRdataPoolSize = SerializeLocalRdata(primFunc.SchedResult.BlockLocalRdatas, _blockLocalRdataWriters, "b");
 
-                // 4. build function.
+                // 3. build function.
                 var visitor = new KernelCSourceConvertVisitor(TargetOptions);
                 visitor.Visit(primFunc);
                 var functionCSource = visitor.GetCSource();
 
-                // 5. write the kernel desc
+                // 4. write the kernel desc
                 using (var writer = _sectionManager.GetWriter(LinkableKernelFunction.KernelHeaderSectionName))
                 {
                     var header = default(KernelDescHeader);
@@ -125,6 +78,7 @@ public unsafe ILinkableFunction Build(BaseFunction baseFunc)
                     header.LocalDataAlign = (uint)primFunc.SchedResult.DataAlign;
                     header.OutputPoolSize = primFunc.SchedResult.OutputUsage;
                     header.LocalDataPoolSize = primFunc.SchedResult.DataUsage;
+                    header.WarpLocalDataPoolSize = primFunc.SchedResult.WarpLocalDataPoolSize;
                     header.BlockLocalDataPoolSize = primFunc.SchedResult.BlockLocalDataPoolSize;
                     writer.Write(ref header);
                 }
@@ -132,6 +86,7 @@ public unsafe ILinkableFunction Build(BaseFunction baseFunc)
                 var memoryPoolDesc = new KernelMemoryPoolDesc(
                     rdataPoolSize,
                     threadLocalRdataPoolSize,
+                    warpLocalRdataPoolSize,
                     blockLocalRdataPoolSize);
                 var kernelDescSection = new LinkedSection(_sectionManager.GetContent(LinkableKernelFunction.KernelHeaderSectionName)!, ".desc", 0, 8, (uint)sizeof(KernelDescHeader));
                 return new LinkableKernelFunction(_id, primFunc, functionCSource, memoryPoolDesc, _sectionManager.GetContent(WellknownSectionNames.Text)!, kernelDescSection);
@@ -154,4 +109,50 @@ public unsafe ILinkableFunction Build(BaseFunction baseFunc)
 
         throw new NotSupportedException($"the {baseFunc.GetType()} {baseFunc.Name} is notsupport for codegen!");
     }
+
+    private ulong SerializeLocalRdata(IReadOnlyDictionary<Const, ValueRange<ulong>> localRdatas, IReadOnlyList<BinaryWriter> localRdataWriters, string scopeName)
+    {
+        ulong localRdataPoolSize = ulong.MinValue;
+        foreach (var (@const, range) in localRdatas)
+        {
+            var tensor = ((TensorConst)@const).Value;
+            var distributedType = (DistributedType)@const.CheckedType;
+            var size = range.Max - range.Min;
+            localRdataPoolSize = System.Math.Max(range.Max, localRdataPoolSize);
+            var dividedDims = DistributedUtility.GetDividedTensorType(distributedType).Shape.ToValueArray();
+            var localStrides = TensorUtilities.GetDefaultStrides(dividedDims);
+            for (int i = 0; i < localRdataWriters.Count; i++)
+            {
+                var localRdataWriter = localRdataWriters[i];
+                var shardIndex = GetScopedShardIndex(i, scopeName);
+                (var localOffset, var localShape) = DistributedUtility.GetLocalOffsetAndShape(distributedType, shardIndex);
+                var linearOffset = TensorUtilities.GetLinearOffset(tensor.Strides, localOffset);
+
+                if ((ulong)TensorUtilities.GetProduct(localShape) * (ulong)tensor.ElementType.SizeInBytes > size)
+                {
+                    throw new InvalidDataException("The Buffer Size Not Equal!");
+                }
+
+                localRdataWriter.Position(checked((long)range.Min));
+                tensor.Serialize(localRdataWriter.BaseStream, linearOffset, localShape, localStrides);
+            }
+        }
+
+        return localRdataPoolSize;
+    }
+
+    private int[] GetScopedShardIndex(int writerIndex, string scopeName)
+    {
+        var hierarchies = TargetOptions.Hierarchies[0];
+        var scopeIndex = TargetOptions.HierarchyNames.IndexOf(scopeName, StringComparison.Ordinal);
+        if (scopeIndex < 0)
+        {
+            return DistributedUtility.GetUnraveledIndex(writerIndex, hierarchies);
+        }
+
+        var scopedHierarchies = hierarchies[..(scopeIndex + 1)];
+        return DistributedUtility.GetUnraveledIndex(writerIndex, scopedHierarchies)
+            .Concat(Enumerable.Repeat(0, hierarchies.Length - scopedHierarchies.Length))
+            .ToArray();
+    }
 }
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/FusionCSourceConvertVisitor.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/FusionCSourceConvertVisitor.cs
index d655ebc19..31a804f95 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/FusionCSourceConvertVisitor.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/FusionCSourceConvertVisitor.cs
@@ -60,7 +60,7 @@ protected override CSymbol VisitFusion(Fusion expr)
             IndentScope.Writer.IndWrite($"template<{string.Join(", ", Enumerable.Range(0, expr.Parameters.Length).Select(x => $"class T{x}"))}> struct {expr.Name} {{\n");
             using (_ = new IndentScope())
             {
-                IndentScope.Writer.IndWrite($"auto operator()({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"const T{i} &{s.Name}").ToArray())}) const noexcept {{\n");
+                IndentScope.Writer.IndWrite($"constexpr auto operator()({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"const T{i} &{s.Name}").ToArray())}) const noexcept {{\n");
 
                 // 2. Function body
                 using (_ = new IndentScope())
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelCSourceConvertVisitor.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelCSourceConvertVisitor.cs
index f20039466..96c6d89a2 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelCSourceConvertVisitor.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelCSourceConvertVisitor.cs
@@ -27,7 +27,7 @@ namespace Nncase.CodeGen.NTT;
 /// </summary>
 internal sealed class KernelCSourceConvertVisitor : CSourceConvertVisitor, IDisposable
 {
-    private readonly HashSet<string> _excludedVars = new() { "data", "block_local_data" };
+    private readonly HashSet<string> _excludedVars = new() { "data", "warp_local_data", "block_local_data" };
     private readonly StringBuilder _kernelBuilder;
     private readonly HashSet<TIR.PrimFunction> _refFuncs;
     private readonly HashSet<TIR.Buffer> _declaredBuffers = new(ReferenceEqualityComparer.Instance);
@@ -47,7 +47,7 @@ public KernelCSourceConvertVisitor(NTTTargetOptions targetOptions)
 
     private Var[] TensorParams => _tensorParams ??= VisitEntry.Parameters.ToArray().OfType<Var>().Where(x => !_excludedVars.Contains(x.Name)).ToArray();
 
-    public static void WriteWithProfiler(string functionName, string tagName = "")
+    public void WriteWithProfiler(string functionName, string tagName = "")
     {
         functionName = functionName.TrimEnd(new char[] { ';', '\n' });
         if (tagName == string.Empty)
@@ -62,12 +62,12 @@ public static void WriteWithProfiler(string functionName, string tagName = "")
         tagName = tagName == string.Empty ? functionName : tagName;
         IndentScope.Writer.IndWrite("{\n");
         IndentScope.Writer.Write($"constexpr std::string_view function_name = \"{tagName}\";\n");
-        IndentScope.Writer.Write($"auto_profiler profiler(function_name, runtime::profiling_level::kernel);\n");
+        IndentScope.Writer.Write($"profile_scope profiler(0, profile_level::kernel);\n");
         IndentScope.Writer.Write($"{functionName};\n");
         IndentScope.Writer.IndWrite("}\n");
     }
 
-    public static void WriteIndWithProfiler(string functionName, string tagName = "")
+    public void WriteIndWithProfiler(string functionName, string tagName = "")
     {
         functionName = functionName.TrimEnd(new char[] { ';', '\n' });
         if (tagName == string.Empty)
@@ -82,7 +82,7 @@ public static void WriteIndWithProfiler(string functionName, string tagName = ""
         tagName = tagName == string.Empty ? functionName : tagName;
         IndentScope.Writer.IndWrite("{\n");
         IndentScope.Writer.IndWrite($"constexpr std::string_view function_name = \"{tagName}\";\n");
-        IndentScope.Writer.IndWrite($"auto_profiler profiler(function_name, runtime::profiling_level::kernel);\n");
+        IndentScope.Writer.IndWrite($"profile_scope profiler(0, profile_level::kernel);\n");
         IndentScope.Writer.IndWrite($"{functionName};\n");
         IndentScope.Writer.IndWrite("}\n");
     }
@@ -92,7 +92,7 @@ public KernelCSource GetCSource()
         var paramsExcluded = VisitEntry.Parameters.ToArray().OfType<IVar>().Where(x => !_excludedVars.Contains(x.Name)).ToArray();
         var templateHeader = TensorParams.Length == 0 ? string.Empty : $"template<{string.Join(", ", Enumerable.Range(0, TensorParams.Length).Select(x => $"class T{x}"))}>" + Environment.NewLine;
         var ctype = templateHeader +
-            $"void {VisitEntry.Name}({string.Concat(paramsExcluded.Select(Visit).Select(s => $"{s.Type} {s.Name}, ").ToArray())}const std::byte *rdata, const std::byte *thread_local_rdata, const std::byte *block_local_rdata, std::byte *thread_local_data, std::byte *block_local_data, std::byte *output, nncase::ntt::runtime::thread_inout_desc *const output_descs)";
+            $"NTT_DEVICE void {VisitEntry.Name}({string.Concat(paramsExcluded.Select(Visit).Select(s => $"{s.Type} {s.Name}, ").ToArray())}const std::byte *rdata, const std::byte *thread_local_rdata, const std::byte *warp_local_rdata, const std::byte *block_local_rdata, std::byte *thread_local_data, std::byte *warp_local_data, std::byte *block_local_data, std::byte *output, nncase::ntt::runtime::thread_inout_desc *const output_descs)";
         return new(
             Declare: ctype + ";\n",
             Kernel: CSourceBuiltn.MakeKernel(ctype, _kernelBuilder.ToString()),
@@ -186,16 +186,18 @@ protected override CSymbol VisitPhysicalBuffer(PhysicalBuffer expr)
         {
             (MemoryLocation.Rdata, 0) => "rdata",
             (MemoryLocation.ThreadLocalRdata, 0) => "thread_local_rdata",
+            (MemoryLocation.WarpLocalRdata, 0) => "warp_local_rdata",
             (MemoryLocation.BlockLocalRdata, 0) => "block_local_rdata",
             (MemoryLocation.Data, 0) => "thread_local_data",
             (MemoryLocation.Data, 1) => "thread_local_data",
+            (MemoryLocation.WarpLocalData, 0) => "warp_local_data",
             (MemoryLocation.BlockLocalData, 0) => "block_local_data",
             (MemoryLocation.Output, 0) => "output",
             _ => throw new NotSupportedException($"{expr.Location}, {expr.Hierarchy}"),
         };
 
         var ptypeName = "std::byte";
-        if (expr.Location is MemoryLocation.Rdata or MemoryLocation.ThreadLocalRdata or MemoryLocation.BlockLocalRdata)
+        if (expr.Location is MemoryLocation.Rdata or MemoryLocation.ThreadLocalRdata or MemoryLocation.WarpLocalRdata or MemoryLocation.BlockLocalRdata)
         {
             // Rdata, ThreadLocalRdata and BlockLocalRdata are const
             ptypeName = $"const {ptypeName}";
@@ -205,12 +207,12 @@ protected override CSymbol VisitPhysicalBuffer(PhysicalBuffer expr)
         if (expr.Size is DimConst)
         {
             var spanSize = (ulong)expr.Size.FixedValue;
-            name = $"std::span<{ptypeName}, {spanSize}>({loc} + {start.Name}UL, {spanSize})";
+            name = $"ntt::span<{ptypeName}, {spanSize}>({loc} + {start.Name}UL, {spanSize})";
         }
         else
         {
             var spanSize = Visit(expr.Size).Name;
-            name = $"std::span<{ptypeName}>({loc} + {start.Name}UL, {spanSize})";
+            name = $"ntt::span<{ptypeName}>({loc} + {start.Name}UL, {spanSize})";
         }
 
         symbol = new(start.Type, name);
@@ -471,7 +473,7 @@ protected override CSymbol VisitCall(Call expr)
                     WriteWithProfiler($"slice({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[3], local: true).Name}, {VisitDimOrShape(args[1]).Name}, {VisitDimOrShape(args[2]).Name}, fixed_dims_v<{string.Join(",", slice.Axes)}>, fixed_dims_v<{string.Join(",", slice.Strides)}>);\n");
                     break;
                 case TIR.NTT.Concat concat:
-                    WriteWithProfiler($"concat(std::make_tuple({string.Join(",", args.SkipLast(1).Select(x => VisitBuffer(x, local: true)).Select(s => s.Name))}), {VisitBuffer(args[^1], local: true).Name}, {concat.Axis}_dim);\n");
+                    WriteWithProfiler($"concat(ntt::make_tuple({string.Join(",", args.SkipLast(1).Select(x => VisitBuffer(x, local: true)).Select(s => s.Name))}), {VisitBuffer(args[^1], local: true).Name}, {concat.Axis}_dim);\n");
                     break;
                 case TIR.NTT.Transpose transpose:
                     WriteWithProfiler($"transpose({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name}, fixed_dims_v<{string.Join(",", transpose.Perm)}>);\n");
@@ -555,7 +557,7 @@ protected override CSymbol VisitCall(Call expr)
                     WriteIndWithProfiler($"get_position_ids({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name}, {KernelUtility.ShardingToC(getPositionIds.DistributedType)}, {Visit(getPositionIds.DistributedType.TensorType.Shape).Name});\n");
                     break;
                 case TIR.NTT.Stack stack:
-                    IndentScope.Writer.Write($"stack<{stack.Axis}>(std::make_tuple({string.Join(",", args.SkipLast(1).Select(x => VisitBuffer(x, local: true)).Select(s => s.Name))}), {VisitBuffer(args[^1], local: true).Name});\n");
+                    IndentScope.Writer.Write($"stack<{stack.Axis}>(ntt::make_tuple({string.Join(",", args.SkipLast(1).Select(x => VisitBuffer(x, local: true)).Select(s => s.Name))}), {VisitBuffer(args[^1], local: true).Name});\n");
                     break;
                 case TIR.NTT.Reshape reshape:
                     IndentScope.Writer.Write($"reshape({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name});\n");
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableFunction.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableFunction.cs
index 3239ac741..1fb2eebb8 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableFunction.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableFunction.cs
@@ -21,11 +21,14 @@ internal unsafe struct KernelDescHeader
     [MarshalAs(UnmanagedType.U8)]
     public ulong LocalDataPoolSize;
 
+    [MarshalAs(UnmanagedType.U8)]
+    public ulong WarpLocalDataPoolSize;
+
     [MarshalAs(UnmanagedType.U8)]
     public ulong BlockLocalDataPoolSize;
 }
 
-internal sealed record KernelMemoryPoolDesc(ulong RdataPoolSize, ulong ThreadLocalRdataPoolSize, ulong BlockLocalRdataPoolSize);
+internal sealed record KernelMemoryPoolDesc(ulong RdataPoolSize, ulong ThreadLocalRdataPoolSize, ulong WarpLocalRdataPoolSize, ulong BlockLocalRdataPoolSize);
 
 internal sealed class LinkableKernelFunction : ILinkableFunction
 {
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableModule.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableModule.cs
index 5b25ad596..d6b881cb1 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableModule.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkableModule.cs
@@ -17,20 +17,24 @@ namespace Nncase.CodeGen.NTT;
 
 internal sealed class LinkableModule : ILinkableModule
 {
+    private readonly string _moduleKind;
     private readonly Stream _desc;
     private readonly Stream _rdata;
     private readonly IReadOnlyList<Stream> _threadLocalRdatas;
     private readonly IReadOnlyList<Stream> _threadLocalCaches;
+    private readonly IReadOnlyList<Stream> _warpLocalRdatas;
     private readonly IReadOnlyList<Stream> _blockLocalRdatas;
     private readonly IReadOnlyList<ILinkableFunction> _functions;
     private readonly NTTTargetOptions _targetOptions;
 
-    public LinkableModule(Stream desc, Stream rdata, IReadOnlyList<Stream> threadLocalRdatas, IReadOnlyList<Stream> threadLocalCaches, IReadOnlyList<Stream> blockLocalRdatas, IReadOnlyList<ILinkableFunction> functions, CompileOptions options)
+    public LinkableModule(string moduleKind, Stream desc, Stream rdata, IReadOnlyList<Stream> threadLocalRdatas, IReadOnlyList<Stream> threadLocalCaches, IReadOnlyList<Stream> warpLocalRdatas, IReadOnlyList<Stream> blockLocalRdatas, IReadOnlyList<ILinkableFunction> functions, CompileOptions options)
     {
+        _moduleKind = moduleKind;
         _desc = desc;
         _rdata = rdata;
         _threadLocalRdatas = threadLocalRdatas;
         _threadLocalCaches = threadLocalCaches;
+        _warpLocalRdatas = warpLocalRdatas;
         _blockLocalRdatas = blockLocalRdatas;
         _functions = functions;
         PublicFunctions = _functions.OfType<LinkableKernelFunction>().ToArray();
@@ -134,14 +138,15 @@ private void WriteModuleTopologyDef(string codegenDir)
         {
             using (var writer = new StreamWriter(fs))
             {
-                writer.Write(CSourceBuiltn.ModuleTopologyDef(_targetOptions));
+                writer.Write(CSourceBuiltn.ModuleTopologyDef(_targetOptions, isCUDA: _moduleKind == CUDATarget.Kind));
             }
         }
     }
 
     private void WriteThreadMain(string codegenDir, LinkableKernelFunction mainFunc, IReadOnlyList<string> kernelFiles)
     {
-        using (var fs = File.Open(Path.Join(codegenDir, "thread_main.cpp"), FileMode.Create))
+        var threadMainExt = _moduleKind == CUDATarget.Kind ? "cu" : "cpp";
+        using (var fs = File.Open(Path.Join(codegenDir, $"thread_main.{threadMainExt}"), FileMode.Create))
         {
             using (var writer = new StreamWriter(fs))
             {
@@ -173,7 +178,7 @@ private void WriteCMakeLists(string codegenDir)
         {
             using (var writer = new StreamWriter(fs))
             {
-                writer.Write(CSourceBuiltn.CMakeDef());
+                writer.Write(CSourceBuiltn.CMakeDef(isCUDA: _moduleKind == CUDATarget.Kind));
             }
         }
     }
@@ -199,12 +204,12 @@ private ILinkedModule GenerateLinkedModule(string codegenDir, LinkableKernelFunc
         var funcText = File.ReadAllBytes(elfPath);
         textWriter.Write(funcText);
         linkedFunctions.Add(new LinkedFunction(mainFunc.Id, mainFunc.SourceFunction, 0, (uint)funcText.Length, mainFunc.Sections));
-        return new LinkedModule(linkedFunctions, _desc, manager.GetContent(WellknownSectionNames.Text)!, _rdata, _threadLocalRdatas, _threadLocalCaches, _blockLocalRdatas, rdataAlign);
+        return new LinkedModule(_moduleKind, linkedFunctions, _desc, manager.GetContent(WellknownSectionNames.Text)!, _rdata, _threadLocalRdatas, _threadLocalCaches, _warpLocalRdatas, _blockLocalRdatas, rdataAlign);
     }
 
     private string CompileCSource(string sourcePath)
     {
-        var compiler = new CSourceCompiler();
+        var compiler = new CSourceCompiler(_moduleKind == CUDATarget.Kind);
         var binDir = Path.Join(sourcePath, "build", "nncase_ntt_module");
         return compiler.Compile(sourcePath, binDir);
     }
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkedModule.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkedModule.cs
index 6165b08fc..827e4371a 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkedModule.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/LinkedModule.cs
@@ -17,21 +17,22 @@ internal unsafe struct ModuleDescHeader
     public uint ThreadDim;
 
     [MarshalAs(UnmanagedType.U4)]
-    public uint BlockDim;
+    public uint WarpDim;
 
     [MarshalAs(UnmanagedType.U4)]
-    public uint ChipDim;
+    public uint BlockDim;
 
     [MarshalAs(UnmanagedType.U4)]
-    public uint Reserved0;
+    public uint ChipDim;
 }
 
 internal sealed class LinkedModule : ILinkedModule
 {
     public const string ModuleHeaderSectionName = ".desc";
 
-    public unsafe LinkedModule(IReadOnlyList<ILinkedFunction> functions, Stream desc, Stream text, Stream rdata, IReadOnlyList<Stream> threadLocalRdatas, IReadOnlyList<Stream> threadLocalCaches, IReadOnlyList<Stream> blockLocalRdatas, ulong rdataAlign)
+    public unsafe LinkedModule(string moduleKind, IReadOnlyList<ILinkedFunction> functions, Stream desc, Stream text, Stream rdata, IReadOnlyList<Stream> threadLocalRdatas, IReadOnlyList<Stream> threadLocalCaches, IReadOnlyList<Stream> warpLocalRdatas, IReadOnlyList<Stream> blockLocalRdatas, ulong rdataAlign)
     {
+        ModuleKind = moduleKind;
         Functions = functions;
         Sections =
         [
@@ -40,11 +41,12 @@ public unsafe LinkedModule(IReadOnlyList<ILinkedFunction> functions, Stream desc
             new LinkedSection(rdata, WellknownSectionNames.Rdata, 0, (uint)rdataAlign, (ulong)rdata.Length),
             new LinkedMultipleContentsSection(threadLocalRdatas, WellknownSectionNames.ThreadLocalRdata, 0, (uint)rdataAlign),
             new LinkedMultipleContentsSection(threadLocalCaches, WellknownSectionNames.ThreadLocalCache, 0, (uint)rdataAlign),
+            new LinkedMultipleContentsSection(warpLocalRdatas, WellknownSectionNames.WarpLocalRdata, 0, (uint)rdataAlign),
             new LinkedMultipleContentsSection(blockLocalRdatas, WellknownSectionNames.BlockLocalRdata, 0, (uint)rdataAlign),
         ];
     }
 
-    public string ModuleKind => "cpu";
+    public string ModuleKind { get; }
 
     public uint Version => 0;
 
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/ModuleBuilder.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/ModuleBuilder.cs
index fe934826a..1d4fef416 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/ModuleBuilder.cs
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/ModuleBuilder.cs
@@ -17,22 +17,36 @@ public sealed class NTTModuleBuilder : IModuleBuilder
     private readonly BinaryWriter _rdataWriter;
     private readonly BinaryWriter[] _threadLocalRdataWriters;
     private readonly BinaryWriter[] _threadLocalCacheWriters;
+    private readonly BinaryWriter[] _warpLocalRdataWriters;
     private readonly BinaryWriter[] _blockLocalRdataWriters;
 
-    public NTTModuleBuilder(CompileOptions options)
+    public NTTModuleBuilder(string moduleKind, CompileOptions options)
     {
+        var targetOptions = (NTTTargetOptions)options.TargetOptions;
+        var hierarchies = targetOptions.Hierarchies[0];
+        ModuleKind = moduleKind;
         _sectionManager = new();
         _rdataWriter = _sectionManager.GetWriter(WellknownSectionNames.Rdata);
-        var shardCount = TensorUtilities.GetProduct(((Targets.NTTTargetOptions)options.TargetOptions).Hierarchies[0]);
+
+        var shardCount = TensorUtilities.GetProduct(hierarchies);
         _threadLocalRdataWriters = new BinaryWriter[shardCount];
         _threadLocalCacheWriters = new BinaryWriter[shardCount];
-        _blockLocalRdataWriters = new BinaryWriter[shardCount / ((Targets.NTTTargetOptions)options.TargetOptions).Hierarchies[0][^1]];
         for (int i = 0; i < shardCount; i++)
         {
             _threadLocalRdataWriters[i] = _sectionManager.GetWriter(WellknownSectionNames.ThreadLocalRdata, i);
             _threadLocalCacheWriters[i] = _sectionManager.GetWriter(WellknownSectionNames.ThreadLocalCache, i);
         }
 
+        var isCUDA = ModuleKind == CUDATarget.Kind;
+        var warpsCount = isCUDA ? shardCount / hierarchies[^1] : 0;
+        _warpLocalRdataWriters = new BinaryWriter[warpsCount];
+        for (int i = 0; i < _warpLocalRdataWriters.Length; i++)
+        {
+            _warpLocalRdataWriters[i] = _sectionManager.GetWriter(WellknownSectionNames.WarpLocalRdata, i);
+        }
+
+        var blocksCount = isCUDA ? (hierarchies.Length > 1 ? warpsCount / hierarchies[^2] : 1) : shardCount / hierarchies[^1];
+        _blockLocalRdataWriters = new BinaryWriter[blocksCount];
         for (int i = 0; i < _blockLocalRdataWriters.Length; i++)
         {
             _blockLocalRdataWriters[i] = _sectionManager.GetWriter(WellknownSectionNames.BlockLocalRdata, i);
@@ -44,7 +58,7 @@ public NTTModuleBuilder(CompileOptions options)
     public CompileOptions CompileOptions { get; }
 
     /// <inheritdoc/>
-    public string ModuleKind => "cpu";
+    public string ModuleKind { get; }
 
     /// <inheritdoc/>
     public ILinkableModule Build(IReadOnlyList<BaseFunction> functions)
@@ -55,9 +69,21 @@ public ILinkableModule Build(IReadOnlyList<BaseFunction> functions)
         using (var writer = _sectionManager.GetWriter(LinkedModule.ModuleHeaderSectionName))
         {
             var header = default(ModuleDescHeader);
+            var hasWarp = targetOptions.HierarchyNames.Contains('w', StringComparison.Ordinal);
             header.ThreadDim = (uint)targetOptions.Hierarchies[0][^1];
-            header.BlockDim = targetOptions.Hierarchies[0].Length < 2 ? 1 : (uint)targetOptions.Hierarchies[0][^2];
-            header.ChipDim = targetOptions.Hierarchies[0].Length < 3 ? 1 : (uint)targetOptions.Hierarchies[0][^3];
+            if (hasWarp)
+            {
+                header.WarpDim = targetOptions.Hierarchies[0].Length < 2 ? 1 : (uint)targetOptions.Hierarchies[0][^2];
+                header.BlockDim = targetOptions.Hierarchies[0].Length < 3 ? 1 : (uint)targetOptions.Hierarchies[0][^3];
+                header.ChipDim = targetOptions.Hierarchies[0].Length < 4 ? 1 : (uint)targetOptions.Hierarchies[0][^4];
+            }
+            else
+            {
+                header.WarpDim = 1;
+                header.BlockDim = targetOptions.Hierarchies[0].Length < 2 ? 1 : (uint)targetOptions.Hierarchies[0][^2];
+                header.ChipDim = targetOptions.Hierarchies[0].Length < 3 ? 1 : (uint)targetOptions.Hierarchies[0][^3];
+            }
+
             writer.Write(ref header);
 
             // cache offsets.
@@ -76,7 +102,7 @@ public ILinkableModule Build(IReadOnlyList<BaseFunction> functions)
             }
         }
 
-        var linkableFunctions = functions.OfType<BaseFunction>().Select((f, i) => new FunctionBuilder((uint)i, _rdataWriter, _threadLocalRdataWriters, _blockLocalRdataWriters, (Targets.NTTTargetOptions)CompileOptions.TargetOptions).Build(f)).ToArray();
+        var linkableFunctions = functions.OfType<BaseFunction>().Select((f, i) => new FunctionBuilder((uint)i, _rdataWriter, _threadLocalRdataWriters, _warpLocalRdataWriters, _blockLocalRdataWriters, (Targets.NTTTargetOptions)CompileOptions.TargetOptions).Build(f)).ToArray();
         _rdataWriter.Flush();
         var threadLocalRdataContents = Enumerable.Range(0, _threadLocalRdataWriters.Length).Select(i =>
         {
@@ -96,12 +122,18 @@ public ILinkableModule Build(IReadOnlyList<BaseFunction> functions)
             return _sectionManager.GetContent(WellknownSectionNames.ThreadLocalCache, i)!;
         }).ToArray();
 
+        var warpLocalRdataContents = Enumerable.Range(0, _warpLocalRdataWriters.Length).Select(i =>
+        {
+            _warpLocalRdataWriters[i].Flush();
+            return _sectionManager.GetContent(WellknownSectionNames.WarpLocalRdata, i)!;
+        }).ToArray();
+
         var blockLocalRdataContents = Enumerable.Range(0, _blockLocalRdataWriters.Length).Select(i =>
         {
             _blockLocalRdataWriters[i].Flush();
             return _sectionManager.GetContent(WellknownSectionNames.BlockLocalRdata, i)!;
         }).ToArray();
 
-        return new LinkableModule(_sectionManager.GetContent(LinkedModule.ModuleHeaderSectionName)!, _sectionManager.GetContent(WellknownSectionNames.Rdata)!, threadLocalRdataContents, threadLocalCacheContents, blockLocalRdataContents, linkableFunctions, CompileOptions);
+        return new LinkableModule(ModuleKind, _sectionManager.GetContent(LinkedModule.ModuleHeaderSectionName)!, _sectionManager.GetContent(WellknownSectionNames.Rdata)!, threadLocalRdataContents, threadLocalCacheContents, warpLocalRdataContents, blockLocalRdataContents, linkableFunctions, CompileOptions);
     }
 }
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/CMakeLists.txt.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/CMakeLists.txt.cshtml
index 27199b31c..6b6b6a6fe 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/CMakeLists.txt.cshtml
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/CMakeLists.txt.cshtml
@@ -2,7 +2,11 @@
 
 cmake_minimum_required(VERSION 3.15)
 
-project(nncase_cpu_module)
+@if (Model.IsCUDA) {
+@:project(nncase_cpu_module CXX CUDA)
+} else {
+@:project(nncase_cpu_module CXX)
+}
 
 option(BUILD_SHARED "Build shared library in linux" OFF)
 option(BUILD_STANDALONE "Build standalone executable" OFF)
@@ -12,5 +16,9 @@ endif()
 
 include(@Html.Raw(Model.CMakePath))
 
-target_sources(nncase_ntt_module PRIVATE thread_main.cpp)
-target_include_directories(nncase_ntt_module PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+@if (Model.IsCUDA) {
+@:target_sources(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE thread_main.cu)
+} else {
+@:target_sources(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE thread_main.cpp)
+}
+target_include_directories(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/Matmul.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/Matmul.cshtml
index 5c81a9b8c..57adcaaa0 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/Matmul.cshtml
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/Matmul.cshtml
@@ -8,18 +8,18 @@
 {
 @:@(Model.Indent)if (@Html.Raw(Model.Arguments[3].Symbol.Name)) {
 // @:@(Model.Indent)  constexpr std::string_view function_name = "matmul";
-// @:@(Model.Indent)  auto_profiler profiler(function_name, runtime::profiling_level::device);
+// @:@(Model.Indent)  profile_scope profiler(function_name, profile_level::device);
 @:@(Model.Indent)  matmul<true, @(Model.Target.TransposeA.ToString().ToLowerInvariant()), @(Model.Target.TransposeB.ToString().ToLowerInvariant())>(@Html.Raw(Model.Arguments[0].Symbol.Name), @Html.Raw(Model.Arguments[1].Symbol.Name), @Html.Raw(Model.Arguments[2].Symbol.Name), @scale, fixed_shape_v<@string.Join(",", Model.Target.LhsVectorizedAxes)>,  fixed_shape_v<>, fixed_shape_v<@string.Join(",", Model.Target.RhsVectorizedAxes)>, fixed_shape_v<>);
 @:@(Model.Indent)} else {
 // @:@(Model.Indent)  constexpr std::string_view function_name = "matmul";
-// @:@(Model.Indent)  auto_profiler profiler(function_name, runtime::profiling_level::device);
+// @:@(Model.Indent)  profile_scope profiler(function_name, profile_level::device);
 @:@(Model.Indent)  matmul<false, @(Model.Target.TransposeA.ToString().ToLowerInvariant()), @(Model.Target.TransposeB.ToString().ToLowerInvariant())>(@Html.Raw(Model.Arguments[0].Symbol.Name), @Html.Raw(Model.Arguments[1].Symbol.Name), @Html.Raw(Model.Arguments[2].Symbol.Name), @scale, fixed_shape_v<@string.Join(",", Model.Target.LhsVectorizedAxes)>,  fixed_shape_v<>, fixed_shape_v<@string.Join(",", Model.Target.RhsVectorizedAxes)>, fixed_shape_v<>);
 @:@(Model.Indent)}
 }
 else
 {
 // @:@(Model.Indent)  constexpr std::string_view function_name = "matmul";
-// @:@(Model.Indent)  auto_profiler profiler(function_name, runtime::profiling_level::device);
+// @:@(Model.Indent)  profile_scope profiler(function_name, profile_level::device);
 @:@(Model.Indent)  matmul<false, @(Model.Target.TransposeA.ToString().ToLowerInvariant()), @(Model.Target.TransposeB.ToString().ToLowerInvariant())>(@Html.Raw(Model.Arguments[0].Symbol.Name), @Html.Raw(Model.Arguments[1].Symbol.Name), @Html.Raw(Model.Arguments[2].Symbol.Name), @scale, fixed_shape_v<@string.Join(",", Model.Target.LhsVectorizedAxes)>,  fixed_shape_v<>, fixed_shape_v<@string.Join(",", Model.Target.RhsVectorizedAxes)>, fixed_shape_v<>);
 }
 @(Model.Indent)}
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/PackedMatMul.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/PackedMatMul.cshtml
index 51d6568ff..0b42768ae 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/PackedMatMul.cshtml
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/Kernels/PackedMatMul.cshtml
@@ -8,18 +8,18 @@
 {
 @:@(Model.Indent)if (@Html.Raw(Model.Arguments[3].Symbol.Name)) {
 // @:@(Model.Indent)  constexpr std::string_view function_name = "matmul";
-// @:@(Model.Indent)  auto_profiler profiler(function_name, runtime::profiling_level::device);
+// @:@(Model.Indent)  profile_scope profiler(function_name, profile_level::device);
 @:@(Model.Indent)  packed_matmul<true>(@Html.Raw(Model.Arguments[0].Symbol.Name), @Html.Raw(Model.Arguments[1].Symbol.Name), @Html.Raw(Model.Arguments[2].Symbol.Name), @(scale));
 @:@(Model.Indent)} else {
 // @:@(Model.Indent)  constexpr std::string_view function_name = "matmul";
-// @:@(Model.Indent)  auto_profiler profiler(function_name, runtime::profiling_level::device);
+// @:@(Model.Indent)  profile_scope profiler(function_name, profile_level::device);
 @:@(Model.Indent)  packed_matmul<false>(@Html.Raw(Model.Arguments[0].Symbol.Name), @Html.Raw(Model.Arguments[1].Symbol.Name), @Html.Raw(Model.Arguments[2].Symbol.Name), @(scale));
 @:@(Model.Indent)}
 }
 else
 {
 // @:@(Model.Indent)  constexpr std::string_view function_name = "matmul";
-// @:@(Model.Indent)  auto_profiler profiler(function_name, runtime::profiling_level::device);
+// @:@(Model.Indent)  profile_scope profiler(function_name, profile_level::device);
 @:@(Model.Indent)  packed_matmul<false>(@Html.Raw(Model.Arguments[0].Symbol.Name), @Html.Raw(Model.Arguments[1].Symbol.Name), @Html.Raw(Model.Arguments[2].Symbol.Name), @(scale));
 }
 @(Model.Indent)}
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/module_topology_def.h.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/module_topology_def.h.cshtml
index 58b3a43e0..83806efdc 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/module_topology_def.h.cshtml
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/module_topology_def.h.cshtml
@@ -1,14 +1,14 @@
 @using System.Linq
 @using NetFabric.Hyperlinq
 @using Nncase
-@model Nncase.Targets.NTTTargetOptions
 @{
-  var hierarchy = Model.Hierarchies[0];
+  var hierarchy = (int[])Model.Hierarchies;
+  var topologyLevels = Model.IsCUDA ? 4 : 3;
 }
 
 #pragma once
 #include <nncase/ntt/shape.h>
 
 namespace nncase::ntt::distributed {
-    constexpr auto topology_shape = ntt::fixed_shape_v<@(string.Join(", ", Enumerable.Repeat(1, 3 - hierarchy.Length).Concat(hierarchy)))>;
+    constexpr auto topology_shape = ntt::fixed_shape_v<@(string.Join(", ", Enumerable.Repeat(1, topologyLevels - hierarchy.Length).Concat(hierarchy)))>;
 }
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/thread_main.cpp.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/thread_main.cpp.cshtml
index 51f00f036..5eb35ec89 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/thread_main.cpp.cshtml
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/thread_main.cpp.cshtml
@@ -7,12 +7,14 @@
   var inputCount = Model.PrimFunction.Parameters.Length;
 }
 
-extern "C" void thread_main(const nncase::ntt::runtime::thread_inout_desc *input_descs,
+extern "C" NTT_DEVICE void thread_main(const nncase::ntt::runtime::thread_inout_desc *input_descs,
   nncase::ntt::runtime::thread_inout_desc *const output_descs,
   const std::byte *rdata,
   const std::byte *thread_local_rdata,
+  const std::byte *warp_local_rdata,
   const std::byte *block_local_rdata,
   std::byte *thread_local_data,
+  std::byte *warp_local_data,
   std::byte *block_local_data,
   std::byte *output) {
   /* prepare inputs */
@@ -72,7 +74,7 @@ extern "C" void thread_main(const nncase::ntt::runtime::thread_inout_desc *input
     throw new NotSupportedException($"not support multi form topology!");
   }
   
-  @(Model.PrimFunction.Name)(@(string.Concat(names.Select(x => $"{x}, ")))rdata, thread_local_rdata, block_local_rdata, thread_local_data, block_local_data, output, output_descs);
+  @(Model.PrimFunction.Name)(@(string.Concat(names.Select(x => $"{x}, ")))rdata, thread_local_rdata, warp_local_rdata, block_local_rdata, thread_local_data, warp_local_data, block_local_data, output, output_descs);
 }
 
 #ifdef NNCASE_STANDALONE
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/topo_aware_runtime.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/topo_aware_runtime.cshtml
index 0ad5b8837..c401fd887 100644
--- a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/topo_aware_runtime.cshtml
+++ b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/topo_aware_runtime.cshtml
@@ -15,8 +15,7 @@
 
 #pragma once
 #include <nncase/ntt/ntt.h>
-#include <thread>
-#include <barrier>
+#include <nncase/ntt/std_containers.h>
 
 /**
  * @@brief topology aware runtime
@@ -33,22 +32,15 @@ namespace tar {
     foreach (var i in comb) {
       shape[i] = 1;
     }
-    var groupRawName = groupName + "_raw";
 
-@:std::barrier<> @(groupRawName)[@(groups)] {
-    @for (int i = 0; i < groups; i++) 
-    {
-  @:std::barrier(@(groupSize)), 
-    }
-@:};
-@:auto @(groupName) = nncase::ntt::make_tensor_view_from_address<std::barrier<>>(@(groupRawName), nncase::ntt::fixed_shape_v<@(string.Join(",", shape))>);
+@:NTT_DEVICE decltype(nncase::ntt::make_tensor<ntt::fixed_barrier<@(groupSize)>>(nncase::ntt::fixed_shape_v<@(string.Join(",", shape))>)) @(groupName);
 @:
   }
 
 @if (Model.CollectivePoolSize > 0) {
-@:alignas(@Model.Alignment) uint8_t collective_pool_ptr[@Model.CollectivePoolSize];
+@:alignas(@Model.Alignment) NTT_DEVICE uint8_t collective_pool_ptr[@Model.CollectivePoolSize];
 } else {
-@:alignas(@Model.Alignment) uint8_t collective_pool_ptr[1];
+@:alignas(@Model.Alignment) NTT_DEVICE uint8_t collective_pool_ptr[1];
 }
 
 enum reduce_kind {
@@ -57,11 +49,11 @@ enum reduce_kind {
 }
 };
 
-constexpr std::array<size_t, @(hierarchy.Length)> Hierarchy = {@(string.Join(", ", hierarchy))};
-auto src_ptr_tensor = nncase::ntt::make_tensor<void *>(nncase::ntt::fixed_shape_v<@(string.Join(",", hierarchy))>);
-auto dest_ptr_tensor = nncase::ntt::make_tensor<void *>(nncase::ntt::fixed_shape_v<@(string.Join(",", hierarchy))>);
+NTT_DEVICE constexpr ntt::array<size_t, @(hierarchy.Length)> Hierarchy = {@(string.Join(", ", hierarchy))};
+NTT_DEVICE auto src_ptr_tensor = nncase::ntt::make_tensor<void *>(nncase::ntt::fixed_shape_v<@(string.Join(",", hierarchy))>);
+NTT_DEVICE auto dest_ptr_tensor = nncase::ntt::make_tensor<void *>(nncase::ntt::fixed_shape_v<@(string.Join(",", hierarchy))>);
 
-template <size_t Level> static std::byte *get_cache_address() {
+template <size_t Level> NTT_DEVICE static std::byte *get_cache_address() {
     return reinterpret_cast<std::byte *>(
         ntt::distributed::detail::global_thread_local_cache_ptr(program_ids())(
             Level));
@@ -77,7 +69,7 @@ namespace tac {
 using namespace nncase;
 
 template <ntt::Shape GlobalShape, ntt::Shape Index, ntt::Tensor TDst>
-void tensor_boxing_load_sync(const GlobalShape &global_shape, const Index &index, TDst &dest)
+NTT_DEVICE void tensor_boxing_load_sync(const GlobalShape &global_shape, const Index &index, TDst &dest)
 {
     using TOutBase = std::decay_t<TDst>;
     using TElem = typename TOutBase::element_type;
@@ -87,7 +79,7 @@ void tensor_boxing_load_sync(const GlobalShape &global_shape, const Index &index
 }
 
 template <ntt::Shape GlobalShape, ntt::Shape Index, ntt::Tensor TSrc>
-void tensor_boxing_store_sync(const GlobalShape &global_shape, const Index &index, TSrc &src)
+NTT_DEVICE void tensor_boxing_store_sync(const GlobalShape &global_shape, const Index &index, TSrc &src)
 {
     using TSrcBase = std::decay_t<TSrc>;
     using TElem = typename TSrcBase::element_type;
@@ -103,14 +95,14 @@ template <tar::reduce_kind Kind> class group_hierarchy_getter;
 @:template <> class group_hierarchy_getter<tar::reduce_kind::@(GetName(comb, string.Empty))> {
   var shape = Enumerable.Range(0, hierarchy.Length).Select(i => comb.Contains(i) ?  hierarchy[i] : 1).ToArray();
 @:public:
-@:    static constexpr auto group_hierarchy = ntt::fixed_shape_v<@(string.Join(", ", shape))>;
+@:    NTT_DEVICE static constexpr auto group_hierarchy = ntt::fixed_shape_v<@(string.Join(", ", shape))>;
 @:};
 }
 
 template <ntt::reduce_op Op, tar::reduce_kind Kind>
 class tensor_reduce_sync_impl {
   public:
-    void reduce_group_sync() const noexcept {
+    NTT_DEVICE void reduce_group_sync() const noexcept {
         @foreach(var comb in combinations) {
           var reduce_group_index = string.Join(", ", Enumerable.Range(0, hierarchy.Length).Select(i => comb.Contains(i) ? "0" : "ntt::distributed::" + hierarchyNames[i] + "id()"));
         @:if constexpr (Kind == tar::reduce_kind::@(GetName(comb, string.Empty))) {
@@ -124,7 +116,7 @@ class tensor_reduce_sync_impl {
     }
 
     template <ntt::Shape TIndexInGroup, ntt::Shape TIndexInGlobal>
-    constexpr auto index_group2global(const TIndexInGroup &index_in_group, const TIndexInGlobal &index_in_global) const noexcept {
+    NTT_DEVICE constexpr auto index_group2global(const TIndexInGroup &index_in_group, const TIndexInGlobal &index_in_global) const noexcept {
         return ntt::generate_shape<TIndexInGlobal::rank()>([&](auto axis) {
             if constexpr (Kind & (1 << (TIndexInGlobal::rank() - axis))) {
                 return index_in_group[axis];
@@ -135,7 +127,7 @@ class tensor_reduce_sync_impl {
     }
 
     template <ntt::Shape TIndexInGlobal>
-    constexpr auto index_global2group(const TIndexInGlobal &index_in_global) const noexcept {
+    NTT_DEVICE constexpr auto index_global2group(const TIndexInGlobal &index_in_global) const noexcept {
         return ntt::generate_shape<TIndexInGlobal::rank()>([&](auto axis) {
             if constexpr (Kind & (1 << (TIndexInGlobal::rank() - axis))) {
                 return index_in_global[axis];
@@ -145,7 +137,7 @@ class tensor_reduce_sync_impl {
         });
     }
 
-    static constexpr auto get_group_size() {
+    NTT_DEVICE static constexpr auto get_group_size() {
         size_t group_size = 1;
         for (size_t i = 1; i <= tar::Hierarchy.size(); i++) {
             if (Kind & (1 << i)) {
@@ -160,7 +152,7 @@ class tensor_reduce_sync_impl {
     }
 
     template <class TSliceIn, class TSliceOut>
-    void reduce_impl(TSliceIn &local, TSliceIn &remote, TSliceOut &dest) {
+    NTT_DEVICE void reduce_impl(TSliceIn &local, TSliceIn &remote, TSliceOut &dest) {
         if constexpr (Op == ntt::reduce_op::max) {
             ntt::binary<ntt::ops::max>(local, remote, dest);
         } else if constexpr (Op == ntt::reduce_op::sum ||
@@ -173,7 +165,7 @@ class tensor_reduce_sync_impl {
         }
     }
 
-    template <class TIn, class TOut> void operator()(TIn &src, TOut &&dest) {
+    template <class TIn, class TOut> NTT_DEVICE void operator()(TIn &src, TOut &&dest) {
         // collect all tensors pointer for access tensor from other nodes.
         using TElem = typename TIn::element_type;
         using TOutBase = std::decay_t<TOut>;
@@ -289,7 +281,7 @@ class tensor_reduce_sync_impl {
 } // namespace detail
 
 template <ntt::reduce_op Op, tar::reduce_kind Kind, class TIn, class TOut>
-void tensor_reduce_sync(TIn &input, TOut &&output) {
+NTT_DEVICE void tensor_reduce_sync(TIn &input, TOut &&output) {
     detail::tensor_reduce_sync_impl<Op, Kind> impl;
     impl(input, output);
 }
diff --git a/modules/Nncase.Modules.NTT/NTTModule.cs b/modules/Nncase.Modules.NTT/NTTModule.cs
index 9ba23e357..fe7497b94 100644
--- a/modules/Nncase.Modules.NTT/NTTModule.cs
+++ b/modules/Nncase.Modules.NTT/NTTModule.cs
@@ -15,5 +15,6 @@ internal class NTTModule : IApplicationPart
     public void ConfigureServices(IRegistrator registrator)
     {
         registrator.Register<ITarget, CPUTarget>(reuse: Reuse.Singleton);
+        registrator.Register<ITarget, CUDATarget>(reuse: Reuse.Singleton);
     }
 }
diff --git a/modules/Nncase.Modules.NTT/Targets/NTTModuleCompiler.cs b/modules/Nncase.Modules.NTT/Targets/CPUModuleCompiler.cs
similarity index 93%
rename from modules/Nncase.Modules.NTT/Targets/NTTModuleCompiler.cs
rename to modules/Nncase.Modules.NTT/Targets/CPUModuleCompiler.cs
index 2585e965b..e0ab0705e 100644
--- a/modules/Nncase.Modules.NTT/Targets/NTTModuleCompiler.cs
+++ b/modules/Nncase.Modules.NTT/Targets/CPUModuleCompiler.cs
@@ -11,7 +11,7 @@
 
 namespace Nncase.Targets;
 
-public class NTTModuleCompiler : IModuleCompiler
+public class CPUModuleCompiler : INTTModuleCompiler
 {
     public string ModuleKind => CPUTarget.Kind;
 
@@ -35,7 +35,7 @@ public class NTTModuleCompiler : IModuleCompiler
         _ => throw new NotSupportedException($"Unsupported architecture: {RuntimeInformation.ProcessArchitecture}"),
     };
 
-    public IModuleBuilder CreateModuleBuilder(CompileOptions options) => new NTTModuleBuilder(options);
+    public IModuleBuilder CreateModuleBuilder(CompileOptions options) => new NTTModuleBuilder(ModuleKind, options);
 
     public bool IsSupportedCall(Call call, CompileOptions options)
     {
diff --git a/modules/Nncase.Modules.NTT/Targets/CPUTarget.cs b/modules/Nncase.Modules.NTT/Targets/CPUTarget.cs
index 04312ec46..15e5e76ac 100644
--- a/modules/Nncase.Modules.NTT/Targets/CPUTarget.cs
+++ b/modules/Nncase.Modules.NTT/Targets/CPUTarget.cs
@@ -23,110 +23,15 @@
 namespace Nncase.Targets;
 
 /// <summary>
-/// Target for NTT.
+/// Target for CPU.
 /// </summary>
-public class CPUTarget : Target
+public class CPUTarget : NTTTarget
 {
     public const string Kind = "cpu";
 
-    private readonly NTTModuleCompiler _nttModuleCompiler = new();
-
     public CPUTarget()
     {
-        ModuleCompilers = [_nttModuleCompiler];
-    }
-
-    public override string Name => Kind;
-
-    public override IReadOnlyList<IModuleCompiler> ModuleCompilers { get; }
-
-    public override (System.CommandLine.Command Command, Func<InvocationContext, System.CommandLine.Command, ITargetOptions> Parser) RegisterCommandAndParser()
-    {
-        var cmd = new NTTTargetOptionsCommand(Kind);
-
-        ITargetOptions ParseTargetCompileOptions(InvocationContext context, Command command)
-        {
-            var binder = new NTTTargetOptionsBinder(cmd);
-            return binder.GetBoundValue(context);
-        }
-
-        return (cmd, ParseTargetCompileOptions);
-    }
-
-    public override void RegisterAffineSelectionPass(IPassManager passManager, CompileOptions options)
-    {
-        passManager.Add<NTTAffineSelectionPass>();
-    }
-
-    public override void RegisterAutoPackingRules(IRulesAddable pass, CompileOptions options)
-    {
-        var nr = _nttModuleCompiler.Nr;
-
-        pass.Add<Passes.Rules.NTT.PackMatMulByN>(nr);
     }
 
-    public override void RegisterAutoVectorizeRules(IRulesAddable pass, CompileOptions options)
-    {
-        // todo config it in the target options.
-        var rank = 1;
-        var lane = _nttModuleCompiler.Lane;
-        var maskVectorStyle = _nttModuleCompiler.MaskVectorStyle;
-
-        pass.Add<Passes.Rules.NTT.VectorizeConv2D>(rank, lane);
-        pass.Add<Passes.Rules.NTT.VectorizeMatMul>(rank, lane);
-        pass.Add<Passes.Rules.NTT.VectorizeLayerNorm>(rank, lane);
-
-        pass.Add<Passes.Rules.NTT.VectorizeBinaryPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeCastPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeComparePropagation>(maskVectorStyle);
-        pass.Add<Passes.Rules.NTT.VectorizeConcatPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeExpandPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeGatherPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizePadPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeReducePropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeReshapePropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeResizeImagePropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeRoPEPropagation>();
-
-        // pass.Add<Passes.Rules.NTT.VectorizeScatterND>(rank, lane);
-        pass.Add<Passes.Rules.NTT.VectorizeSlicePropagation>();
-
-        // pass.Add<Passes.Rules.NTT.VectorizeSwish>(rank, lane);
-        pass.Add<Passes.Rules.NTT.VectorizeTransposePropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeUnaryPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeUnsqueezePropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizeWherePropagation>(maskVectorStyle);
-
-        pass.Add<Passes.Rules.NTT.CastDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.ConcatDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.BinaryDevectorizeLhsPropagation>();
-        pass.Add<Passes.Rules.NTT.BinaryDevectorizeRhsPropagation>();
-        pass.Add<Passes.Rules.NTT.VectorizedMatMulDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.ReshapeDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.SliceDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.SwishDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.TransposeDevectorizePropagation>();
-        pass.Add<Passes.Rules.NTT.UnaryDevectorizePropagation>();
-
-        pass.Add<Passes.Rules.Neutral.FoldConstCall>();
-        pass.Add<Passes.Rules.NTT.FoldVectorizeDevectorize>();
-        pass.Add<Passes.Rules.NTT.FoldVectorizeConcatDevectorize>();
-        pass.Add<Passes.Rules.NTT.TransposeVectorizeMatMulInputs>();
-        pass.Add<Passes.Rules.Neutral.FoldTwoReshapes>();
-        pass.Add<Passes.Rules.Neutral.FoldTwoTransposes>();
-    }
-
-    public override void RegisterTIRSelectionPass(IPassManager passManager, CompileOptions optionsÍ)
-    {
-        passManager.Add<NTTTIRSelectionPass>();
-    }
-
-    public override void RegisterPostAutoVectorizePass(IPassManager passManager, CompileOptions options)
-    {
-        passManager.AddWithName<DataflowPass>("FoldPostOps").Configure(p =>
-        {
-            p.Add<Passes.Rules.NTT.FoldCastPostOps>();
-            p.Add<Passes.Rules.NTT.FoldBinaryPostOps>();
-        });
-    }
+    protected override INTTModuleCompiler NTTModuleCompiler { get; } = new CPUModuleCompiler();
 }
diff --git a/modules/Nncase.Modules.NTT/Targets/CUDAModuleCompiler.cs b/modules/Nncase.Modules.NTT/Targets/CUDAModuleCompiler.cs
new file mode 100644
index 000000000..534e03a41
--- /dev/null
+++ b/modules/Nncase.Modules.NTT/Targets/CUDAModuleCompiler.cs
@@ -0,0 +1,34 @@
+﻿// Copyright (c) Canaan Inc. All rights reserved.
+// Licensed under the Apache license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using Nncase.CodeGen;
+using Nncase.CodeGen.NTT;
+using Nncase.IR;
+using Nncase.Passes;
+
+namespace Nncase.Targets;
+
+public class CUDAModuleCompiler : INTTModuleCompiler
+{
+    public string ModuleKind => CUDATarget.Kind;
+
+    public MaskVectorStyle MaskVectorStyle => MaskVectorStyle.Fat;
+
+    public int Lane => 16;
+
+    public int Nr => 4;
+
+    public IModuleBuilder CreateModuleBuilder(CompileOptions options) => new NTTModuleBuilder(ModuleKind, options);
+
+    public bool IsSupportedCall(Call call, CompileOptions options)
+    {
+        return call.Target switch
+        {
+            Op op => PassUtility.IsCpuSupported(op, call, call.Arguments, ModuleKind),
+            _ => false,
+        };
+    }
+}
diff --git a/modules/Nncase.Modules.NTT/Targets/CUDATarget.cs b/modules/Nncase.Modules.NTT/Targets/CUDATarget.cs
new file mode 100644
index 000000000..76fd11b39
--- /dev/null
+++ b/modules/Nncase.Modules.NTT/Targets/CUDATarget.cs
@@ -0,0 +1,37 @@
+﻿// Copyright (c) Canaan Inc. All rights reserved.
+// Licensed under the Apache license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Collections.Generic;
+using System.CommandLine;
+using System.CommandLine.Invocation;
+using System.Linq;
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.Options;
+using Nncase.CodeGen;
+using Nncase.CodeGen.NTT;
+using Nncase.IR;
+using Nncase.Passes;
+using Nncase.Passes.Rules.Neutral;
+using Nncase.Passes.Rules.ShapeBucket;
+using Nncase.Passes.Transforms;
+using Nncase.Quantization;
+
+namespace Nncase.Targets;
+
+/// <summary>
+/// Target for CUDA.
+/// </summary>
+public class CUDATarget : NTTTarget
+{
+    public const string Kind = "cuda";
+
+    public CUDATarget()
+    {
+    }
+
+    protected override INTTModuleCompiler NTTModuleCompiler { get; } = new CUDAModuleCompiler();
+}
diff --git a/modules/Nncase.Modules.NTT/Targets/INTTModuleCompiler.cs b/modules/Nncase.Modules.NTT/Targets/INTTModuleCompiler.cs
new file mode 100644
index 000000000..cd8d8cd7e
--- /dev/null
+++ b/modules/Nncase.Modules.NTT/Targets/INTTModuleCompiler.cs
@@ -0,0 +1,19 @@
+﻿// Copyright (c) Canaan Inc. All rights reserved.
+// Licensed under the Apache license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using Nncase.CodeGen;
+using Nncase.CodeGen.NTT;
+using Nncase.IR;
+using Nncase.Passes;
+
+namespace Nncase.Targets;
+
+public interface INTTModuleCompiler : IModuleCompiler
+{
+    int Lane { get; }
+
+    int Nr { get; }
+}
diff --git a/modules/Nncase.Modules.NTT/Targets/NTTTarget.cs b/modules/Nncase.Modules.NTT/Targets/NTTTarget.cs
new file mode 100644
index 000000000..2dc8ca7ba
--- /dev/null
+++ b/modules/Nncase.Modules.NTT/Targets/NTTTarget.cs
@@ -0,0 +1,130 @@
+﻿// Copyright (c) Canaan Inc. All rights reserved.
+// Licensed under the Apache license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Collections.Generic;
+using System.CommandLine;
+using System.CommandLine.Invocation;
+using System.Linq;
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.Options;
+using Nncase.CodeGen;
+using Nncase.CodeGen.NTT;
+using Nncase.IR;
+using Nncase.Passes;
+using Nncase.Passes.Rules.Neutral;
+using Nncase.Passes.Rules.ShapeBucket;
+using Nncase.Passes.Transforms;
+using Nncase.Quantization;
+
+namespace Nncase.Targets;
+
+/// <summary>
+/// Target for NTT.
+/// </summary>
+public abstract class NTTTarget : Target
+{
+    public NTTTarget()
+    {
+        ModuleCompilers = [NTTModuleCompiler];
+    }
+
+    public override string Name => NTTModuleCompiler.ModuleKind;
+
+    public override IReadOnlyList<IModuleCompiler> ModuleCompilers { get; }
+
+    protected abstract INTTModuleCompiler NTTModuleCompiler { get; }
+
+    public override (System.CommandLine.Command Command, Func<InvocationContext, System.CommandLine.Command, ITargetOptions> Parser) RegisterCommandAndParser()
+    {
+        var cmd = new NTTTargetOptionsCommand(Name);
+
+        ITargetOptions ParseTargetCompileOptions(InvocationContext context, Command command)
+        {
+            var binder = new NTTTargetOptionsBinder(cmd);
+            return binder.GetBoundValue(context);
+        }
+
+        return (cmd, ParseTargetCompileOptions);
+    }
+
+    public override void RegisterAffineSelectionPass(IPassManager passManager, CompileOptions options)
+    {
+        passManager.Add<NTTAffineSelectionPass>();
+    }
+
+    public override void RegisterAutoPackingRules(IRulesAddable pass, CompileOptions options)
+    {
+        var nr = NTTModuleCompiler.Nr;
+
+        pass.Add<Passes.Rules.NTT.PackMatMulByN>(nr);
+    }
+
+    public override void RegisterAutoVectorizeRules(IRulesAddable pass, CompileOptions options)
+    {
+        // todo config it in the target options.
+        var rank = 1;
+        var lane = NTTModuleCompiler.Lane;
+        var maskVectorStyle = NTTModuleCompiler.MaskVectorStyle;
+
+        pass.Add<Passes.Rules.NTT.VectorizeConv2D>(rank, lane);
+        pass.Add<Passes.Rules.NTT.VectorizeMatMul>(rank, lane);
+        pass.Add<Passes.Rules.NTT.VectorizeLayerNorm>(rank, lane);
+
+        pass.Add<Passes.Rules.NTT.VectorizeBinaryPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeCastPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeComparePropagation>(maskVectorStyle);
+        pass.Add<Passes.Rules.NTT.VectorizeConcatPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeExpandPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeGatherPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizePadPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeReducePropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeReshapePropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeResizeImagePropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeRoPEPropagation>();
+
+        // pass.Add<Passes.Rules.NTT.VectorizeScatterND>(rank, lane);
+        pass.Add<Passes.Rules.NTT.VectorizeSlicePropagation>();
+
+        // pass.Add<Passes.Rules.NTT.VectorizeSwish>(rank, lane);
+        pass.Add<Passes.Rules.NTT.VectorizeTransposePropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeUnaryPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeUnsqueezePropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizeWherePropagation>(maskVectorStyle);
+
+        pass.Add<Passes.Rules.NTT.CastDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.ConcatDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.BinaryDevectorizeLhsPropagation>();
+        pass.Add<Passes.Rules.NTT.BinaryDevectorizeRhsPropagation>();
+        pass.Add<Passes.Rules.NTT.VectorizedMatMulDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.ReshapeDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.SliceDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.SwishDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.TransposeDevectorizePropagation>();
+        pass.Add<Passes.Rules.NTT.UnaryDevectorizePropagation>();
+
+        pass.Add<Passes.Rules.Neutral.FoldConstCall>();
+        pass.Add<Passes.Rules.NTT.FoldVectorizeDevectorize>();
+        pass.Add<Passes.Rules.NTT.FoldVectorizeConcatDevectorize>();
+        pass.Add<Passes.Rules.NTT.TransposeVectorizeMatMulInputs>();
+        pass.Add<Passes.Rules.Neutral.FoldTwoReshapes>();
+        pass.Add<Passes.Rules.Neutral.FoldTwoTransposes>();
+    }
+
+    public override void RegisterTIRSelectionPass(IPassManager passManager, CompileOptions optionsÍ)
+    {
+        passManager.Add<NTTTIRSelectionPass>(NTTModuleCompiler.ModuleKind);
+    }
+
+    public override void RegisterPostAutoVectorizePass(IPassManager passManager, CompileOptions options)
+    {
+        passManager.AddWithName<DataflowPass>("FoldPostOps").Configure(p =>
+        {
+            p.Add<Passes.Rules.NTT.FoldCastPostOps>();
+            p.Add<Passes.Rules.NTT.FoldBinaryPostOps>();
+        });
+    }
+}
diff --git a/ntt/cmake/compile_flags.cmake b/ntt/cmake/compile_flags.cmake
index 0fac799dc..c9050db84 100644
--- a/ntt/cmake/compile_flags.cmake
+++ b/ntt/cmake/compile_flags.cmake
@@ -70,3 +70,6 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES
        message(FATAL_ERROR "Unsupported riscv64 target")
    endif()
 endif()
+
+if (CMAKE_CUDA_COMPILER)
+endif()
diff --git a/ntt/cmake/ntt_module.cmake b/ntt/cmake/ntt_module.cmake
index 8b91c5cfb..0f9ab7db9 100644
--- a/ntt/cmake/ntt_module.cmake
+++ b/ntt/cmake/ntt_module.cmake
@@ -1,37 +1,105 @@
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.16)
 
 include(${CMAKE_CURRENT_LIST_DIR}/compile_flags.cmake)
 
-if (BUILD_STANDALONE)
-    add_executable(nncase_ntt_module ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp)
+if (CMAKE_CUDA_COMPILER)
+    set(NNCASE_NTT_MODULE_TARGET_NAME nncase_ntt_module_bundle)
+
+    find_program(NVLINK nvlink REQUIRED)
+    find_program(FATBINARY fatbinary REQUIRED)
+    message(STATUS "Found nvlink: ${NVLINK}")
+    message(STATUS "Found fatbinary: ${FATBINARY}")
 else()
-    add_library(nncase_ntt_module SHARED ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp)
+    set(NNCASE_NTT_MODULE_TARGET_NAME nncase_ntt_module)
 endif()
 
-target_compile_features(nncase_ntt_module PUBLIC cxx_std_20)
-target_include_directories(nncase_ntt_module PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include)
-set_target_properties(nncase_ntt_module PROPERTIES PREFIX "" SUFFIX "")
-set_target_properties(nncase_ntt_module PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_compile_definitions(nncase_ntt_module PUBLIC -DNNCASE_CPU_MODULE=1)
-set_property(TARGET nncase_ntt_module PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
+if (BUILD_STANDALONE)
+    add_executable(${NNCASE_NTT_MODULE_TARGET_NAME} ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp)
+elseif (CMAKE_CUDA_COMPILER)
+    add_library(${NNCASE_NTT_MODULE_TARGET_NAME} OBJECT)
+else()
+    add_library(${NNCASE_NTT_MODULE_TARGET_NAME} SHARED ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp)
+endif()
 
-target_sources(nncase_ntt_module PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp)
+target_compile_features(${NNCASE_NTT_MODULE_TARGET_NAME} PUBLIC cxx_std_20)
+target_include_directories(${NNCASE_NTT_MODULE_TARGET_NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include)
+set_target_properties(${NNCASE_NTT_MODULE_TARGET_NAME} PROPERTIES PREFIX "" SUFFIX "")
+set_target_properties(${NNCASE_NTT_MODULE_TARGET_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET ${NNCASE_NTT_MODULE_TARGET_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
 
 if (BUILD_STANDALONE)
-    target_compile_definitions(nncase_ntt_module PUBLIC -DNNCASE_STANDALONE=1)
+    target_compile_definitions(${NNCASE_NTT_MODULE_TARGET_NAME} PUBLIC -DNNCASE_STANDALONE=1)
 endif()
 
 if (MSVC)
-    set_property(TARGET nncase_ntt_module PROPERTY
+    set_property(TARGET ${NNCASE_NTT_MODULE_TARGET_NAME} PROPERTY
         MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-    set_target_properties(nncase_ntt_module PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE)
-    target_link_options(nncase_ntt_module PRIVATE /NODEFAULTLIB)
-    target_link_libraries(nncase_ntt_module PRIVATE "libvcruntime$<$<CONFIG:Debug>:d>"
+    set_target_properties(${NNCASE_NTT_MODULE_TARGET_NAME} PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE)
+    target_link_options(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE /NODEFAULTLIB)
+    target_link_libraries(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE "libvcruntime$<$<CONFIG:Debug>:d>"
                                                     "msvcrt$<$<CONFIG:Debug>:d>"
                                                     "ucrt$<$<CONFIG:Debug>:d>"
                                                     "libcpmt$<$<CONFIG:Debug>:d>")
 elseif(APPLE)
-    target_link_options(nncase_ntt_module PRIVATE -ld_classic -lc)
+    target_link_options(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE -ld_classic -lc)
+else()
+    target_link_libraries(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE pthread)
+endif()
+
+if (CMAKE_CUDA_COMPILER)
+    target_sources(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../src/cuda_runtime.cu)
+    target_compile_definitions(${NNCASE_NTT_MODULE_TARGET_NAME} PUBLIC -DNNCASE_CUDA_MODULE=1)
+    target_compile_options(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+        -fgpu-rdc
+        --cuda-device-only
+        >)
+
+    foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
+        # Link device code for this architecture
+        set(linked_obj "${CMAKE_CURRENT_BINARY_DIR}/linked_sm_${arch}.o")
+        add_custom_command(
+            OUTPUT ${linked_obj}
+            COMMAND ${NVLINK}
+                -arch=sm_${arch}
+                $<TARGET_OBJECTS:${NNCASE_NTT_MODULE_TARGET_NAME}>
+                -o ${linked_obj}
+            DEPENDS ${NNCASE_NTT_MODULE_TARGET_NAME} $<TARGET_OBJECTS:${NNCASE_NTT_MODULE_TARGET_NAME}>
+            COMMAND_EXPAND_LISTS
+            VERBATIM
+            COMMENT "Linking device code for sm_${arch}"
+        )
+
+        # Add to the list of all linked objects
+        list(APPEND ALL_LINKED_OBJECTS ${linked_obj})
+    endforeach()
+
+    add_custom_target(device_link ALL
+        DEPENDS ${ALL_LINKED_OBJECTS}
+    )
+
+    set(FATBIN_ARGS "")
+    foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
+        # Find the linked object for this architecture
+        set(arch_obj "${CMAKE_CURRENT_BINARY_DIR}/linked_sm_${arch}.o")
+        list(APPEND FATBIN_ARGS --image3=kind=elf,sm=${arch},file="${arch_obj}")
+    endforeach()
+
+    # Create the fatbinary from all linked objects
+    add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/nncase_ntt_module
+        COMMAND ${FATBINARY}
+            -64
+            --create ${CMAKE_CURRENT_BINARY_DIR}/nncase_ntt_module
+            ${FATBIN_ARGS}
+        DEPENDS ${ALL_LINKED_OBJECTS}
+        COMMENT "Creating fatbinary from linked objects"
+        VERBATIM
+    )
+
+    add_custom_target(fatbin ALL
+        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/nncase_ntt_module
+    )
 else()
-    target_link_libraries(nncase_ntt_module PRIVATE pthread)
+    target_sources(${NNCASE_NTT_MODULE_TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp)
+    target_compile_definitions(${NNCASE_NTT_MODULE_TARGET_NAME} PUBLIC -DNNCASE_CPU_MODULE=1)
 endif()
diff --git a/ntt/include/nncase/bfloat16.h b/ntt/include/nncase/bfloat16.h
index 1051c31e1..8c614eb63 100644
--- a/ntt/include/nncase/bfloat16.h
+++ b/ntt/include/nncase/bfloat16.h
@@ -43,11 +43,11 @@ struct bfloat16 {
     constexpr operator __bf16() const noexcept {
         return std::bit_cast<__bf16>(value_);
     }
-// #else
-//     constexpr operator float() const noexcept {
-//         uint32_t value = raw() << 16;
-//         return std::bit_cast<float>(value);
-//     }
+    // #else
+    //     constexpr operator float() const noexcept {
+    //         uint32_t value = raw() << 16;
+    //         return std::bit_cast<float>(value);
+    //     }
 
 #endif
 
@@ -133,7 +133,6 @@ struct bfloat16 {
         return uint64_t(double(*this));
     }
 
-
     constexpr explicit operator uint8_t() const noexcept {
         return uint8_t(float(*this));
     }
@@ -142,7 +141,6 @@ struct bfloat16 {
         return int8_t(float(*this));
     }
 
-
     constexpr explicit operator int16_t() const noexcept {
         return int16_t(float(*this));
     }
@@ -151,7 +149,6 @@ struct bfloat16 {
         return uint16_t(float(*this));
     }
 
-
     constexpr explicit operator bool() const noexcept {
         return bool(std::bit_cast<uint16_t>(*this));
     }
@@ -162,8 +159,6 @@ struct bfloat16 {
                               : nan();
     }
 
-
-
     static constexpr bfloat16 epsilon() noexcept {
         // 0x1.0p-7
         return from_raw(0x3c00);
@@ -199,12 +194,14 @@ struct bfloat16 {
 };
 
 #define DEFINE_BF16_BINARY_BF16RET(x)                                          \
-    inline bfloat16 operator x(bfloat16 a, bfloat16 b) noexcept {              \
+    NTT_ALWAYS_INLINE constexpr bfloat16 operator x(bfloat16 a,                \
+                                                    bfloat16 b) noexcept {     \
         return bfloat16::round_to_bfloat16(float(a) x float(b));               \
     }
 
 #define DEFINE_BF16_BINARY_BOOLRET(x)                                          \
-    inline bool operator x(bfloat16 a, bfloat16 b) noexcept {                  \
+    NTT_ALWAYS_INLINE constexpr bool operator x(bfloat16 a,                    \
+                                                bfloat16 b) noexcept {         \
         return float(a) x float(b);                                            \
     }
 
@@ -218,7 +215,8 @@ DEFINE_BF16_BINARY_BOOLRET(>=)
 DEFINE_BF16_BINARY_BOOLRET(>)
 
 #define DEFINE_BF16_BINARY_SELF_MOD(x, op)                                     \
-    inline bfloat16 &operator x(bfloat16 & a, bfloat16 b) noexcept {           \
+    NTT_ALWAYS_INLINE constexpr bfloat16 &operator x(bfloat16 &a,              \
+                                                     bfloat16 b) noexcept {    \
         a = a op b;                                                            \
         return a;                                                              \
     }
@@ -228,15 +226,17 @@ DEFINE_BF16_BINARY_SELF_MOD(-=, -)
 DEFINE_BF16_BINARY_SELF_MOD(*=, *)
 DEFINE_BF16_BINARY_SELF_MOD(/=, /)
 
-inline bfloat16 operator-(bfloat16 a) noexcept {
+NTT_ALWAYS_INLINE constexpr bfloat16 operator-(bfloat16 a) noexcept {
     return bfloat16::round_to_bfloat16(-float(a));
 }
 
-inline bool operator==(const bfloat16 &lhs, const bfloat16 &rhs) noexcept {
+NTT_ALWAYS_INLINE constexpr bool operator==(const bfloat16 &lhs,
+                                            const bfloat16 &rhs) noexcept {
     return lhs.raw() == rhs.raw();
 }
 
-inline bool operator!=(const bfloat16 &lhs, const bfloat16 &rhs) noexcept {
+NTT_ALWAYS_INLINE constexpr bool operator!=(const bfloat16 &lhs,
+                                            const bfloat16 &rhs) noexcept {
     return lhs.raw() != rhs.raw();
 }
 } // namespace nncase
@@ -305,67 +305,76 @@ template <> struct numeric_limits<nncase::bfloat16> {
 };
 
 using nncase::bfloat16;
-inline bool isinf(const bfloat16 &a) { return std::isinf(float(a)); }
-inline bool isnan(const bfloat16 &a) { return std::isnan(float(a)); }
-inline bool isfinite(const bfloat16 &a) { return std::isfinite(float(a)); }
-inline bfloat16 abs(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bool isinf(const bfloat16 &a) {
+    return std::isinf(float(a));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bool isnan(const bfloat16 &a) {
+    return std::isnan(float(a));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bool isfinite(const bfloat16 &a) {
+    return std::isfinite(float(a));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 abs(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(fabsf(float(a)));
 }
-inline bfloat16 acos(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 acos(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(std::acos(float(a)));
 }
-inline bfloat16 asin(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 asin(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(std::asin(float(a)));
 }
-inline bfloat16 erf(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 erf(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(std::erff(float(a)));
 }
-inline bfloat16 exp(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 exp(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(expf(float(a)));
 }
-inline bfloat16 log(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 log(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(logf(float(a)));
 }
-inline bfloat16 log10(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 log10(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(log10f(float(a)));
 }
-inline bfloat16 sqrt(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 sqrt(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(sqrtf(float(a)));
 }
-inline bfloat16 pow(const bfloat16 &a, const bfloat16 &b) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 pow(const bfloat16 &a,
+                                               const bfloat16 &b) {
     return bfloat16::round_to_bfloat16(powf(float(a), float(b)));
 }
-inline bfloat16 sin(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 sin(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(sinf(float(a)));
 }
-inline bfloat16 cos(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 cos(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(cosf(float(a)));
 }
-inline bfloat16 tan(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 tan(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(tanf(float(a)));
 }
-inline bfloat16 tanh(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 tanh(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(tanhf(float(a)));
 }
-inline bfloat16 floor(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 floor(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(floorf(float(a)));
 }
-inline bfloat16 ceil(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 ceil(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(ceilf(float(a)));
 }
-inline bfloat16 round(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 round(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(roundf(float(a)));
 }
-inline bfloat16 nearbyint(const bfloat16 &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bfloat16 nearbyint(const bfloat16 &a) {
     return bfloat16::round_to_bfloat16(nearbyintf(float(a)));
 }
-inline long lrint(const bfloat16 &a) { return lrintf(float(a)); }
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE long lrint(const bfloat16 &a) {
+    return lrintf(float(a));
+}
 
 template <> struct is_arithmetic<bfloat16> : public true_type {};
 
 } // namespace std
 
-inline nncase::bfloat16 operator"" _bf16(long double x) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE nncase::bfloat16
+operator""_bf16(long double x) {
     return nncase::bfloat16(float(x));
 }
-
diff --git a/ntt/include/nncase/float8.h b/ntt/include/nncase/float8.h
index c6043c93f..7a36f16e8 100644
--- a/ntt/include/nncase/float8.h
+++ b/ntt/include/nncase/float8.h
@@ -36,7 +36,7 @@
 */
 #pragma once
 
-#if defined(__GNUC__) && defined(__x86_64__)
+#if defined(__GNUC__) && defined(__x86_64__) && !defined(__clang__)
 #pragma GCC optimize("no-strict-aliasing")
 #endif
 
@@ -81,11 +81,12 @@
 // #include <cuda_fp16.h>
 
 // #include "nncase/nncase.h"
+#include "ntt/compiler_defs.h"
 #include "bfloat16.h"
 #include "half.h"
 #ifndef CUTLASS_HOST_DEVICE
-#define CUTLASS_HOST_DEVICE inline
-#define CUTLASS_DEVICE inline
+#define CUTLASS_HOST_DEVICE NTT_HOST_DEVICE inline
+#define CUTLASS_DEVICE NTT_DEVICE inline
 #endif // !CUTLASS_HOST_DEVICE
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -1384,22 +1385,22 @@ struct numeric_limits<nncase::float_e5m2_t>
 //
 
 CUTLASS_HOST_DEVICE
-nncase::float_e4m3_t operator"" _fe4m3(long double x) {
+nncase::float_e4m3_t operator""_fe4m3(long double x) {
     return nncase::float_e4m3_t(float(x));
 }
 
 CUTLASS_HOST_DEVICE
-nncase::float_e4m3_t operator"" _fe4m3(unsigned long long int x) {
+nncase::float_e4m3_t operator""_fe4m3(unsigned long long int x) {
     return nncase::float_e4m3_t(int(x));
 }
 
 CUTLASS_HOST_DEVICE
-nncase::float_e5m2_t operator"" _fe5m2(long double x) {
+nncase::float_e5m2_t operator""_fe5m2(long double x) {
     return nncase::float_e5m2_t(float(x));
 }
 
 CUTLASS_HOST_DEVICE
-nncase::float_e5m2_t operator"" _fe5m2(unsigned long long int x) {
+nncase::float_e5m2_t operator""_fe5m2(unsigned long long int x) {
     return nncase::float_e5m2_t(int(x));
 }
 
diff --git a/ntt/include/nncase/half.h b/ntt/include/nncase/half.h
index 7d1a3e0b2..f09826c43 100644
--- a/ntt/include/nncase/half.h
+++ b/ntt/include/nncase/half.h
@@ -24,11 +24,19 @@
 #include <limits>
 #include <type_traits>
 
-#ifdef __F16C__
+#ifdef __CUDA_ARCH__
+#include <cuda_fp16.h>
+#elif defined(__F16C__)
 #include <immintrin.h>
 #endif
 
 namespace nncase {
+#ifdef __CUDA_ARCH__
+using native_half_t = __half;
+#else
+using native_half_t = _Float16;
+#endif
+
 struct fp16_from_raw_t {
     explicit fp16_from_raw_t() = default;
 };
@@ -44,7 +52,7 @@ struct half {
 
   public:
     constexpr half() noexcept = default;
-    constexpr half(_Float16 v) noexcept : value_(v) {}
+    constexpr half(native_half_t v) noexcept : value_(v) {}
 
     template <class T,
               class = std::enable_if_t<std::is_integral<T>::value ||
@@ -54,9 +62,11 @@ struct half {
 
     static constexpr half round_to_half(float v) {
         if (std::is_constant_evaluated()) {
-            return (_Float16)v;
+            return (native_half_t)v;
         } else {
-#ifdef __F16C__
+#ifdef __CUDA_ARCH__
+            return __float2half_rn(v);
+#elif defined(__F16C__)
             // To avoid truncsfhf2
             return from_raw(_cvtss_sh(v, _MM_FROUND_NEARBYINT));
 #else
@@ -64,7 +74,7 @@ struct half {
 #endif
         }
 
-        return (_Float16)v;
+        return (native_half_t)v;
     }
 
     static constexpr half epsilon() noexcept { return from_raw(0x0800); }
@@ -91,14 +101,16 @@ struct half {
         : value_(round_to_half(float(x)).value_) {}
 
     constexpr half(fp16_from_raw_t, uint16_t value) noexcept
-        : value_(std::bit_cast<_Float16>(value)) {}
+        : value_(std::bit_cast<native_half_t>(value)) {}
 
-    constexpr operator _Float16() const noexcept { return value_; }
+    constexpr operator native_half_t() const noexcept { return value_; }
     constexpr operator float() const noexcept {
         if (std::is_constant_evaluated()) {
             return (float)value_;
         } else {
-#ifdef __F16C__
+#ifdef __CUDA_ARCH__
+            return __half2float(value_);
+#elif defined(__F16C__)
             // To avoid extendhfdf2
             return _cvtsh_ss(raw());
 #else
@@ -177,7 +189,7 @@ struct half {
     }
 
   private:
-    _Float16 value_;
+    native_half_t value_;
 };
 
 #define DEFINE_FP16_BINARY_FP16RET(x)                                          \
@@ -216,7 +228,7 @@ DEFINE_FP16_BINARY_BOOLRET(>=)
 DEFINE_FP16_BINARY_BOOLRET(>)
 
 #define DEFINE_FP16_BINARY_SELF_MOD(x, op)                                     \
-    NTT_ALWAYS_INLINE half &operator x(half & a, half b) noexcept {            \
+    NTT_ALWAYS_INLINE half &operator x(half &a, half b) noexcept {             \
         a = a op b;                                                            \
         return a;                                                              \
     }
@@ -242,7 +254,8 @@ inline std::ostream &operator<<(std::ostream &os, const half &a) {
     os << std::to_string(float(a));
     return os;
 }
-inline half nextafter(const half &from, const half &to) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half nextafter(const half &from,
+                                                 const half &to) {
     if (from.raw() == to.raw()) {
         return to;
     }
@@ -365,48 +378,76 @@ template <> struct numeric_limits<nncase::half> {
 };
 
 using nncase::half;
-inline bool isinf(const half &a) { return std::isinf((float)(a)); }
-inline bool isnan(const half &a) { return std::isnan(float(a)); }
-inline bool isfinite(const half &a) { return std::isfinite(float(a)); }
-inline half abs(const half &a) { return half::round_to_half(fabsf(float(a))); }
-inline half fabs(const half &a) { return half::round_to_half(fabs(float(a))); }
-inline half exp(const half &a) { return half::round_to_half(expf(float(a))); }
-inline half log(const half &a) { return half::round_to_half(logf(float(a))); }
-inline half log10(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bool isinf(const half &a) {
+    return std::isinf((float)(a));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bool isnan(const half &a) {
+    return std::isnan(float(a));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE bool isfinite(const half &a) {
+    return std::isfinite(float(a));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half abs(const half &a) {
+    return half::round_to_half(fabsf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half fabs(const half &a) {
+    return half::round_to_half(fabs(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half exp(const half &a) {
+    return half::round_to_half(expf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half log(const half &a) {
+    return half::round_to_half(logf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half log10(const half &a) {
     return half::round_to_half(log10f(float(a)));
 }
-inline half sqrt(const half &a) { return half::round_to_half(sqrtf(float(a))); }
-inline half sin(const half &a) { return half::round_to_half(sinf(float(a))); }
-inline half cos(const half &a) { return half::round_to_half(cosf(float(a))); }
-inline half tan(const half &a) { return half::round_to_half(tanf(float(a))); }
-inline half tanh(const half &a) { return half::round_to_half(tanh(float(a))); }
-inline half floor(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half sqrt(const half &a) {
+    return half::round_to_half(sqrtf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half sin(const half &a) {
+    return half::round_to_half(sinf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half cos(const half &a) {
+    return half::round_to_half(cosf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half tan(const half &a) {
+    return half::round_to_half(tanf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half tanh(const half &a) {
+    return half::round_to_half(tanh(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half floor(const half &a) {
     return half::round_to_half(floorf(float(a)));
 }
-inline half ceil(const half &a) { return half::round_to_half(ceilf(float(a))); }
-inline half round(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half ceil(const half &a) {
+    return half::round_to_half(ceilf(float(a)));
+}
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half round(const half &a) {
     return half::round_to_half(roundf(float(a)));
 }
-inline half nearbyint(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half nearbyint(const half &a) {
     return half::round_to_half(nearbyintf(float(a)));
 }
-inline half acos(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half acos(const half &a) {
     return half::round_to_half(std::acos(float(a)));
 }
-inline half asin(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half asin(const half &a) {
     return half::round_to_half(std::asin(float(a)));
 }
-inline half cosh(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half cosh(const half &a) {
     return half::round_to_half(std::cosh(float(a)));
 }
-inline half sinh(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half sinh(const half &a) {
     return half::round_to_half(std::sinh(float(a)));
 }
-inline half erf(const half &a) {
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE half erf(const half &a) {
     return half::round_to_half(std::erff(float(a)));
 }
-inline long lrint(const half &a) { return lrintf(float(a)); }
+NTT_ALWAYS_INLINE NTT_HOST_DEVICE long lrint(const half &a) {
+    return lrintf(float(a));
+}
 
 template <> struct is_floating_point<half> : public std::true_type {};
 template <> struct is_arithmetic<half> : public true_type {};
-} // namespace std
\ No newline at end of file
+} // namespace std
diff --git a/ntt/include/nncase/ntt/apply.h b/ntt/include/nncase/ntt/apply.h
index 12677aae4..5192e0ddc 100644
--- a/ntt/include/nncase/ntt/apply.h
+++ b/ntt/include/nncase/ntt/apply.h
@@ -29,7 +29,7 @@ template <size_t Axis, class Shape, FixedShape TTile, class Offsets,
 NTT_ALWAYS_INLINE constexpr void
 apply_impl(dynamic_shape_t<Shape::rank()> &index, Offsets offsets,
            const Shape &shape, const TTile &tile, Callable &&callable,
-           const std::tuple<Strides...> &strides) {
+           const ntt::tuple<Strides...> &strides) {
     auto call = [&]<size_t... I>(std::index_sequence<I...>) {
         if constexpr (sizeof...(Strides)) {
             callable(index, offsets[fixed_dim_v<I>]...);
@@ -47,7 +47,7 @@ apply_impl(dynamic_shape_t<Shape::rank()> &index, Offsets offsets,
                                  std::forward<Callable>(callable), strides);
         }
         ntt::loop<sizeof...(Strides)>([&](auto i) {
-            offsets[i] += std::get<i>(strides)[fixed_dim_v<Axis>] *
+            offsets[i] += ntt::get<i>(strides)[fixed_dim_v<Axis>] *
                           tile[fixed_dim_v<Axis>];
         });
     }
@@ -62,7 +62,7 @@ NTT_ALWAYS_INLINE constexpr void apply(const TShape &shape, Callable &&callable,
         detail::apply_impl<0>(index, make_repeat_shape<sizeof...(TStrides)>(0),
                               shape, make_ones_shape<TShape::rank()>(),
                               std::forward<Callable>(callable),
-                              std::forward_as_tuple(strides...));
+                              ntt::forward_as_tuple(strides...));
     } else {
         if constexpr (sizeof...(TStrides)) {
             callable(fixed_shape_v<>, (strides, (dim_t)0)...);
@@ -80,7 +80,7 @@ apply_tiled(const TShape &shape, const TTile &tile, Callable &&callable,
         dynamic_shape_t<TShape::rank()> index{};
         detail::apply_impl<0>(index, make_repeat_shape<sizeof...(TStrides)>(0),
                               shape, tile, std::forward<Callable>(callable),
-                              std::forward_as_tuple(strides...));
+                              ntt::forward_as_tuple(strides...));
     } else {
         if constexpr (sizeof...(TStrides)) {
             callable(fixed_shape_v<>, (strides, (dim_t)0)...);
diff --git a/ntt/include/nncase/ntt/arch/cpu/remote_tensor.h b/ntt/include/nncase/ntt/arch/cpu/remote_tensor.h
index f339dabd1..b9c7a3538 100644
--- a/ntt/include/nncase/ntt/arch/cpu/remote_tensor.h
+++ b/ntt/include/nncase/ntt/arch/cpu/remote_tensor.h
@@ -34,12 +34,13 @@ extern decltype(nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
 template <class T, topology RemoteScope, topology TensorScope,
           ScopedProgramIds<TensorScope> TLocalProgramIds,
           ScopedProgramIds<TensorScope> TRemoteProgramIds>
-static auto get_remote_address(const TLocalProgramIds &local_program_ids,
-                               const TRemoteProgramIds &remote_program_ids,
-                               T *local_address) {
+auto get_remote_address(const TLocalProgramIds &local_program_ids,
+                        const TRemoteProgramIds &remote_program_ids,
+                        T *local_address) {
     auto start = (size_t)global_local_data_ptr(local_program_ids)(0_dim);
     auto end = (size_t)global_local_data_ptr(local_program_ids)(1_dim);
-    auto remote_address = (size_t)global_local_data_ptr(remote_program_ids)(0_dim);
+    auto remote_address =
+        (size_t)global_local_data_ptr(remote_program_ids)(0_dim);
     if ((uintptr_t)local_address < start || (uintptr_t)local_address >= end) {
         start = (size_t)global_thread_local_rdata_ptr(local_program_ids)(0_dim);
         end = (size_t)global_thread_local_rdata_ptr(local_program_ids)(1_dim);
@@ -47,7 +48,8 @@ static auto get_remote_address(const TLocalProgramIds &local_program_ids,
             (size_t)global_thread_local_rdata_ptr(remote_program_ids)(0_dim);
         if ((uintptr_t)local_address < start ||
             (uintptr_t)local_address >= end) {
-            start = (size_t)global_block_local_rdata_ptr(local_program_ids)(0_dim);
+            start =
+                (size_t)global_block_local_rdata_ptr(local_program_ids)(0_dim);
             remote_address =
                 (size_t)global_block_local_rdata_ptr(remote_program_ids)(0_dim);
         }
diff --git a/ntt/include/nncase/ntt/arch/cpu/runtime.h b/ntt/include/nncase/ntt/arch/cpu/runtime.h
index 902c0f3f1..7a2b524bb 100644
--- a/ntt/include/nncase/ntt/arch/cpu/runtime.h
+++ b/ntt/include/nncase/ntt/arch/cpu/runtime.h
@@ -15,234 +15,13 @@
 #pragma once
 #include "../../profiling.h"
 #include "../../runtime.h"
-#include <cstdarg>
 #include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <string_view>
-#include <unordered_map>
-#include <vector>
 
 #ifdef __APPLE__
 #include <pthread.h>
 #endif
 
 namespace nncase::ntt::runtime {
-
-struct record_id {
-    int cid = -1;
-    int bid = -1;
-    int tid = -1;
-};
-
-class timer_record : public nncase::ntt::runtime::timer_record_base<record_id> {
-  public:
-    bool is_valid() const override {
-        return instance_id_.cid != -1 && instance_id_.bid != -1 &&
-               instance_id_.tid != -1;
-    }
-
-    void set_time(std::string_view function_name, uint64_t start_time,
-                  uint64_t end_time) override {
-        auto &stats = function_stats_[function_name];
-        stats.calls.push_back({start_time, end_time});
-        stats.call_count++;
-        stats.total_time += end_time - start_time;
-    }
-
-    void set_level(std::string_view filename, profiling_level level) override {
-        auto &stats = function_stats_[filename];
-        stats.level = level;
-    }
-
-    // print statistics
-    void console_print() const override {
-
-        if (is_valid()) {
-
-            std::cout << "\033[34m\n"
-                      << "Core Id:" << instance_id_.cid
-                      << ", Block Id:" << instance_id_.bid
-                      << ", Thread Id:" << instance_id_.tid << "\033[0m\n";
-            std::cout << "\033[34mStatistics for NTT kernels. \033[0m\n";
-            for (const auto &[name, stats] : function_stats_) {
-                std::cout << "Function: " << name << "\n";
-                std::cout << "Level: " << ntt::runtime::to_string(stats.level)
-                          << "\n";
-                std::cout << "\tCalls: " << stats.call_count << "\n";
-                std::cout << "\tTotal time: " << stats.total_time
-                          << " microseconds\n";
-                uint64_t call_count = 0;
-                for (const auto &call : stats.calls) {
-                    std::cout << "\t\t"
-                              << "Call " << call_count++ << ": \n";
-                    std::cout << "\t\tStart time: " << call.start_time
-                              << " microseconds\n";
-                    std::cout << "\t\tEnd time: " << call.end_time
-                              << " microseconds\n";
-                    std::cout
-                        << "\t\tDuration: " << call.end_time - call.start_time
-                        << " microseconds\n";
-                }
-            }
-        }
-    }
-
-    void csv_print(std::string_view filename) const override {
-        if (is_valid()) {
-            std::ofstream csv_file(filename.data());
-            if (!csv_file.is_open()) {
-                std::cerr << "Failed to open file: " << filename << std::endl;
-                return;
-            }
-
-            csv_file
-                << "Core Id,Block Id,Thread Id,Function,Level,Calls,Total Time "
-                   "(microseconds),Call Index,Start Time (microseconds),End "
-                   "Time (microseconds),Duration (microseconds)\n";
-
-            for (const auto &[name, stats] : function_stats_) {
-                uint64_t call_count = 0;
-                for (const auto &call : stats.calls) {
-                    csv_file << instance_id_.cid << "," << instance_id_.bid
-                             << "," << instance_id_.tid << "," << name << ","
-                             << ntt::runtime::to_string(stats.level) << ","
-                             << stats.call_count << "," << stats.total_time
-                             << "," << call_count++ << "," << call.start_time
-                             << "," << call.end_time << ","
-                             << (call.end_time - call.start_time) << "\n";
-                }
-            }
-
-            csv_file.close();
-        }
-    }
-
-    void markdown_print(std::string_view filename) const override {
-
-        if (is_valid()) {
-            std::ofstream md_file(filename.data());
-            if (!md_file.is_open()) {
-                std::cerr << "Failed to open file: " << filename << std::endl;
-                return;
-            }
-
-            md_file << "### Core Information\n";
-            md_file << "| Core Id | Block Id | Thread Id |\n";
-            md_file << "|---------|----------|-----------|\n";
-            md_file << "| " << instance_id_.cid << " | " << instance_id_.bid
-                    << " | " << instance_id_.tid << " |\n";
-
-            md_file << "\n### NTT Kernels Statistics\n";
-
-            for (const auto &[name, stats] : function_stats_) {
-                md_file << "#### Function: " << name << "\n";
-                md_file << "| Level | Calls | Total Time (microseconds) |\n";
-                md_file << "|-------|-------|---------------------------|\n";
-                md_file << "| " << ntt::runtime::to_string(stats.level) << " | "
-                        << stats.call_count << " | " << stats.total_time
-                        << " |\n";
-
-                md_file << "\n**Call Details:**\n";
-                md_file << "| Call Index | Start Time (microseconds) | End "
-                           "Time (microseconds) | Duration (microseconds) |\n";
-                md_file << "|------------|---------------------------|---------"
-                           "----------------|-------------------------|\n";
-
-                uint64_t call_count = 0;
-                for (const auto &call : stats.calls) {
-                    md_file << "| " << call_count++ << " | " << call.start_time
-                            << " | " << call.end_time << " | "
-                            << (call.end_time - call.start_time) << " |\n";
-                }
-                md_file << "\n";
-            }
-
-            md_file.close();
-        }
-    }
-
-    void json_print(std::string_view filename) const override {
-
-        if (is_valid()) {
-            std::ofstream json_file(filename.data());
-            if (!json_file.is_open()) {
-                std::cerr << "Failed to open file: " << filename << std::endl;
-                return;
-            }
-
-            std::string pid = "\"cid: " + std::to_string(instance_id_.cid) +
-                              ", bid: " + std::to_string(instance_id_.bid) +
-                              "\"";
-            std::string tid =
-                "\"tid: " + std::to_string(instance_id_.tid) + "\"";
-            json_file << "[\n";
-
-            bool first = true;
-            for (const auto &[name, stats] : function_stats_) {
-                for (const auto &call : stats.calls) {
-                    if (stats.level == profiling_level::kernel) {
-                        if (!first) {
-                            json_file << ",\n";
-                        }
-                        first = false;
-                        json_file << "  {\n";
-                        json_file << "    \"name\": \"" << name << "\",\n";
-                        json_file << "    \"ph\": \"X\",\n";
-                        json_file << "    \"ts\": " << call.start_time << ",\n";
-                        json_file << "    \"dur\": "
-                                  << (call.end_time - call.start_time) << ",\n";
-                        json_file << "    \"pid\": " << pid << ",\n";
-                        json_file << "    \"tid\": " << tid << ",\n";
-                        json_file << "    \"args\": { \"level:\":\""
-                                  << ntt::runtime::to_string(stats.level)
-                                  << " \"}\n";
-                        json_file << "  }";
-                    }
-                }
-            }
-
-            for (const auto &[name, stats] : function_stats_) {
-                for (const auto &call : stats.calls) {
-                    if (stats.level == profiling_level::device) {
-                        if (!first) {
-                            json_file << ",\n";
-                        }
-                        first = false;
-                        json_file << "  {\n";
-                        json_file << "    \"name\": \"" << name << "\",\n";
-                        json_file << "    \"ph\": \"X\",\n";
-                        json_file << "    \"ts\": " << call.start_time << ",\n";
-                        json_file << "    \"dur\": "
-                                  << (call.end_time - call.start_time) << ",\n";
-                        json_file << "    \"pid\": " << pid << ",\n";
-                        json_file << "    \"tid\": " << tid << ",\n";
-                        json_file << "    \"args\": { \"level:\":\""
-                                  << ntt::runtime::to_string(stats.level)
-                                  << " \"}\n";
-                        json_file << "  }";
-                    }
-                }
-            }
-
-            json_file << "\n]\n";
-            json_file.close();
-        }
-    }
-
-    timer_record() = default;
-
-    ~timer_record() {
-        console_print();
-        markdown_print("nncase_profiling.md");
-        csv_print("nncase_profiling.csv");
-        json_print("nncase_profiling.json");
-    }
-
-    void set_id(record_id id) override { instance_id_ = id; }
-};
-
 struct cpu_block_entry_params_t {
     size_t tdim;
     size_t bdim;
@@ -250,12 +29,11 @@ struct cpu_block_entry_params_t {
     size_t bid;
     size_t cid;
     size_t cpu_id_offset;
+    uint8_t enable_profiling;
     const thread_inout_desc *input_descs;
     thread_inout_desc *const output_descs;
     std::span<const std::byte> rdata;
     std::byte *output;
-    uint8_t enable_profiling;
-    timer_record *timer_records;
     const uint64_t *thread_local_rdata_header;
     const uint64_t *thread_local_cache_header;
     std::span<const std::byte> thread_local_rdata;
@@ -264,6 +42,8 @@ struct cpu_block_entry_params_t {
     std::span<const std::byte> block_local_rdata;
     std::span<std::byte> thread_local_data;
     std::span<std::byte> block_local_data;
+    std::span<profile_record> profile_records;
+    uint32_t *profile_record_counts;
 #ifdef __APPLE__
     pthread_key_t cpu_thread_context_key;
 #endif
@@ -273,8 +53,9 @@ struct cpu_thread_context_t {
     size_t tid;
     size_t bid;
     size_t cid;
-    timer_record *timer_records;
     uint8_t enable_profiling;
+    std::span<profile_record> profile_records;
+    uint32_t *profile_record_counts;
 
     static cpu_thread_context_t &current() noexcept;
 };
diff --git a/ntt/include/nncase/ntt/arch/cuda/distributed.h b/ntt/include/nncase/ntt/arch/cuda/distributed.h
new file mode 100644
index 000000000..0aeaee9ca
--- /dev/null
+++ b/ntt/include/nncase/ntt/arch/cuda/distributed.h
@@ -0,0 +1,17 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "remote_tensor.h"
+#include "topology.h"
diff --git a/ntt/include/nncase/ntt/arch/cpu/profiling.h b/ntt/include/nncase/ntt/arch/cuda/profiling.h
similarity index 74%
rename from ntt/include/nncase/ntt/arch/cpu/profiling.h
rename to ntt/include/nncase/ntt/arch/cuda/profiling.h
index a5022fa17..7105f0f2e 100644
--- a/ntt/include/nncase/ntt/arch/cpu/profiling.h
+++ b/ntt/include/nncase/ntt/arch/cuda/profiling.h
@@ -28,16 +28,16 @@ namespace nncase::ntt {
 // static nncase::ntt::runtime::timer_record
 //     timer_records[CHIP_COUNTER][BLOCK_COUNTER][THREAD_COUNTER];
 
-// auto_profiler, start timing and end timing
-class auto_profiler {
+// profile_scope, start timing and end timing
+class profile_scope {
   public:
-    inline uint64_t get_current_time() const {
+    __device__ inline uint64_t get_current_time() const {
         return std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::high_resolution_clock::now().time_since_epoch())
             .count();
     }
 
-    auto_profiler(std::string_view function_name)
+    __device__ profile_scope(std::string_view function_name)
         : cid_(program_id<topology::chip>()),
           bid_(program_id<topology::block>()),
           tid_(program_id<topology::thread>()) {
@@ -50,15 +50,15 @@ class auto_profiler {
         }
     }
 
-    auto_profiler(std::string_view function_name,
-                  runtime::profiling_level level)
-        : auto_profiler(function_name) { // 调用另一个构造函数
+    __device__ profile_scope(std::string_view function_name,
+                             profile_level level)
+        : profile_scope(function_name) { // 调用另一个构造函数
         if (enable_profiling_) {
             level_ = level; // 设置 level
         }
     }
 
-    ~auto_profiler() {
+    __device__ ~profile_scope() {
         if (enable_profiling_) {
             timer_storage_->set_id({cid_, bid_, tid_});
             end_time_ = get_current_time();
@@ -74,16 +74,17 @@ class auto_profiler {
     int cid_;
     int bid_;
     int tid_;
-    nncase::ntt::runtime::profiling_level level_;
+    nncase::ntt::profile_level level_;
     nncase::ntt::runtime::timer_record *timer_storage_;
     bool enable_profiling_;
 
-    inline bool get_profiler_option() noexcept {
-        return runtime::cpu_thread_context_t::current().enable_profiling;
+    __device__ inline bool get_profiler_option() noexcept {
+        return runtime::cuda_thread_context_t::current().enable_profiling;
     }
 
-    inline nncase::ntt::runtime::timer_record *get_timer_record() noexcept {
-        return runtime::cpu_thread_context_t::current().timer_records;
+    __device__ inline nncase::ntt::runtime::timer_record *
+    get_timer_record() noexcept {
+        return runtime::cuda_thread_context_t::current().timer_records;
     }
 };
 
diff --git a/ntt/include/nncase/ntt/arch/cuda/remote_tensor.h b/ntt/include/nncase/ntt/arch/cuda/remote_tensor.h
new file mode 100644
index 000000000..06e644b89
--- /dev/null
+++ b/ntt/include/nncase/ntt/arch/cuda/remote_tensor.h
@@ -0,0 +1,81 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../../distributed/remote_tensor.h"
+#include "../../tensor.h"
+#include "../../vector.h"
+
+namespace nncase::ntt::distributed {
+namespace detail {
+extern __device__ decltype(nncase::ntt::make_tensor<
+                           nncase::ntt::vector<uintptr_t, 2>>(
+    nncase::ntt::distributed::topology_shape)) global_local_data_ptr;
+
+extern __device__ decltype(nncase::ntt::make_tensor<
+                           nncase::ntt::vector<uintptr_t, 2>>(
+    nncase::ntt::distributed::topology_shape)) global_thread_local_rdata_ptr;
+
+extern __device__ decltype(nncase::ntt::make_tensor<
+                           nncase::ntt::vector<uintptr_t, 3>>(
+    nncase::ntt::distributed::topology_shape)) global_thread_local_cache_ptr;
+
+extern __device__ decltype(nncase::ntt::make_tensor<
+                           nncase::ntt::vector<uintptr_t, 2>>(
+    nncase::ntt::distributed::topology_shape)) global_block_local_rdata_ptr;
+
+template <class T, topology RemoteScope, topology TensorScope,
+          ScopedProgramIds<TensorScope> TLocalProgramIds,
+          ScopedProgramIds<TensorScope> TRemoteProgramIds>
+__device__ auto get_remote_address(const TLocalProgramIds &local_program_ids,
+                                   const TRemoteProgramIds &remote_program_ids,
+                                   T *local_address) {
+    auto start = (size_t)global_local_data_ptr(local_program_ids)(0_dim);
+    auto end = (size_t)global_local_data_ptr(local_program_ids)(1_dim);
+    auto remote_address =
+        (size_t)global_local_data_ptr(remote_program_ids)(0_dim);
+    if ((uintptr_t)local_address < start || (uintptr_t)local_address >= end) {
+        start = (size_t)global_thread_local_rdata_ptr(local_program_ids)(0_dim);
+        end = (size_t)global_thread_local_rdata_ptr(local_program_ids)(1_dim);
+        remote_address =
+            (size_t)global_thread_local_rdata_ptr(remote_program_ids)(0_dim);
+        if ((uintptr_t)local_address < start ||
+            (uintptr_t)local_address >= end) {
+            start =
+                (size_t)global_block_local_rdata_ptr(local_program_ids)(0_dim);
+            remote_address =
+                (size_t)global_block_local_rdata_ptr(remote_program_ids)(0_dim);
+        }
+    }
+
+    return local_address - (T *)start + (T *)remote_address;
+}
+} // namespace detail
+
+template <topology RemoteScope, topology TensorScope>
+struct remote_tensor_constructor {
+    template <class T, Shape TShape, Strides TStrides,
+              ScopedProgramIds<TensorScope> TLocalProgramIds,
+              ScopedProgramIds<TensorScope> TRemoteProgramIds>
+    constexpr auto operator()(T *data, const TShape &shape,
+                              const TStrides &strides,
+                              const TLocalProgramIds &local_program_ids,
+                              const TRemoteProgramIds &remote_program_ids) {
+        auto remote_address =
+            detail::get_remote_address<T, RemoteScope, TensorScope>(
+                local_program_ids, remote_program_ids, data);
+        return make_tensor_view_from_address(remote_address, shape, strides);
+    }
+};
+} // namespace nncase::ntt::distributed
diff --git a/ntt/include/nncase/ntt/arch/cuda/runtime.h b/ntt/include/nncase/ntt/arch/cuda/runtime.h
new file mode 100644
index 000000000..4db64d06b
--- /dev/null
+++ b/ntt/include/nncase/ntt/arch/cuda/runtime.h
@@ -0,0 +1,57 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../../profiling.h"
+#include "../../runtime.h"
+#include "../../std_containers.h"
+#include <cstdint>
+
+namespace nncase::ntt::runtime {
+struct cuda_block_entry_params_t {
+    size_t tdim;
+    size_t bdim;
+    size_t cdim;
+    size_t cid;
+    uint8_t enable_profiling;
+    const thread_inout_desc *input_descs;
+    thread_inout_desc *const output_descs;
+    ntt::span<const std::byte> rdata;
+    std::byte *output;
+    const uint64_t *thread_local_rdata_header;
+    ntt::span<const std::byte> thread_local_rdata;
+    const uint64_t *warp_local_rdata_header;
+    ntt::span<const std::byte> warp_local_rdata;
+    const uint64_t *block_local_rdata_header;
+    ntt::span<const std::byte> block_local_rdata;
+    ntt::span<std::byte> thread_local_data;
+    ntt::span<std::byte> warp_local_data;
+    ntt::span<std::byte> block_local_data;
+    ntt::span<profile_record> profile_records;
+    uint32_t *profile_record_counts;
+};
+
+struct cuda_thread_context_t {
+    size_t cid;
+    uint8_t enable_profiling;
+    ntt::span<profile_record> profile_records;
+    uint32_t *profile_record_counts;
+
+    NTT_DEVICE static cuda_thread_context_t &current() noexcept;
+};
+} // namespace nncase::ntt::runtime
+
+extern "C" NTT_KERNEL NTT_RUNTIME_API void
+block_entry(const nncase::ntt::runtime::cuda_block_entry_params_t &params);
+using block_entry_t = decltype(block_entry) *;
diff --git a/ntt/include/nncase/ntt/arch/cuda/topology.h b/ntt/include/nncase/ntt/arch/cuda/topology.h
new file mode 100644
index 000000000..d5a81dd7a
--- /dev/null
+++ b/ntt/include/nncase/ntt/arch/cuda/topology.h
@@ -0,0 +1,90 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../../distributed/topology.h"
+#include "runtime.h"
+#include <cooperative_groups.h>
+#include <cuda/ptx>
+
+namespace nncase::ntt::distributed {
+template <> struct program_id_getter<topology::thread> {
+    __device__ static size_t id() noexcept {
+        if constexpr (program_dim<topology::thread>() == warpSize) {
+            return cuda::ptx::get_sreg_laneid();
+
+        } else {
+            return threadIdx.x % program_dim<topology::thread>();
+        }
+    }
+};
+
+template <> struct program_id_getter<topology::warp> {
+    __device__ static size_t id() noexcept {
+        return threadIdx.x / program_dim<topology::thread>();
+    }
+};
+
+template <> struct program_id_getter<topology::block> {
+    __device__ static size_t id() noexcept { return blockIdx.x; }
+};
+
+template <> struct program_id_getter<topology::chip> {
+    __device__ static size_t id() noexcept {
+        return runtime::cuda_thread_context_t::current().cid;
+    }
+};
+
+inline __device__ size_t tid() noexcept {
+    return program_id<topology::thread>();
+}
+
+inline __device__ size_t wid() noexcept { return program_id<topology::warp>(); }
+
+inline __device__ size_t bid() noexcept {
+    return program_id<topology::block>();
+}
+
+inline __device__ size_t cid() noexcept { return program_id<topology::chip>(); }
+
+inline constexpr auto tdim() noexcept {
+    return program_dim<topology::thread>();
+}
+inline constexpr auto wdim() noexcept { return program_dim<topology::warp>(); }
+inline constexpr auto bdim() noexcept { return program_dim<topology::block>(); }
+inline constexpr auto cdim() noexcept { return program_dim<topology::chip>(); }
+
+template <> class topology_synchronizer<topology::thread> {
+  public:
+    __device__ static void synchronize() noexcept { __syncwarp(); }
+};
+
+template <> class topology_synchronizer<topology::warp> {
+  public:
+    __device__ static void synchronize() noexcept { __syncthreads(); }
+};
+
+template <> class topology_synchronizer<topology::block> {
+  public:
+    __device__ static void synchronize() noexcept {
+        cooperative_groups::grid_group g = cooperative_groups::this_grid();
+        g.sync();
+    }
+};
+
+template <> class topology_synchronizer<topology::chip> {
+  public:
+    __device__ static void synchronize() noexcept {}
+};
+} // namespace nncase::ntt::distributed
diff --git a/ntt/include/nncase/ntt/arch/cuda/topology_def.h b/ntt/include/nncase/ntt/arch/cuda/topology_def.h
new file mode 100644
index 000000000..ae87a742e
--- /dev/null
+++ b/ntt/include/nncase/ntt/arch/cuda/topology_def.h
@@ -0,0 +1,19 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace nncase::ntt::distributed {
+enum class topology { chip, block, warp, thread, count__ };
+} // namespace nncase::ntt::distributed
diff --git a/ntt/include/nncase/ntt/arch/cuda/vector_ops.h b/ntt/include/nncase/ntt/arch/cuda/vector_ops.h
new file mode 100644
index 000000000..e3d0a326d
--- /dev/null
+++ b/ntt/include/nncase/ntt/arch/cuda/vector_ops.h
@@ -0,0 +1,19 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../../vector_ops.h"
+#include <type_traits>
+
+namespace nncase::ntt::ops {} // namespace nncase::ntt::ops
diff --git a/ntt/include/nncase/ntt/arch/x86_64/ukernels.h b/ntt/include/nncase/ntt/arch/x86_64/ukernels.h
index 78055ea97..095cf5f8c 100644
--- a/ntt/include/nncase/ntt/arch/x86_64/ukernels.h
+++ b/ntt/include/nncase/ntt/arch/x86_64/ukernels.h
@@ -337,9 +337,9 @@ class u_pack2d<true, TIn, TOut, float, vector<float, 8, 8>> {
 
         constexpr auto axes_temp = TAxes{};
         constexpr auto conti_dims_input =
-            contiguous_dims(input.shape(), input.strides());
+            contiguous_dims(TIn::shape(), TIn::strides());
         constexpr auto conti_dims_output =
-            contiguous_dims(output.shape(), output.strides());
+            contiguous_dims(TOut::shape(), TOut::strides());
 
         if constexpr (TAxes::rank() == 2 &&
                       axes_temp[0_dim] + 1 == axes_temp[1_dim] &&
@@ -349,7 +349,6 @@ class u_pack2d<true, TIn, TOut, float, vector<float, 8, 8>> {
             if constexpr (TAxes::rank() > 0 &&
                           (TAxes{}[-1]) == (TIn::rank() - 1)) {
                 using TVec = vector<float, 8, 8>;
-                constexpr auto in_rank = TIn::rank();
                 constexpr auto out_rank = TOut::rank();
                 constexpr auto lanes = TVec::shape();
                 constexpr auto out_shape = TOut::shape();
@@ -371,7 +370,6 @@ class u_pack2d<true, TIn, TOut, float, vector<float, 8, 8>> {
             } else {
                 using TVec = vector<float, 8, 8>;
                 constexpr auto in_rank = TIn::rank();
-                constexpr auto out_rank = TOut::rank();
                 constexpr auto lanes = TVec::shape();
                 const auto out_shape = output.shape();
 
diff --git a/ntt/include/nncase/ntt/caching.h b/ntt/include/nncase/ntt/caching.h
index 6108f5ed1..ace35c211 100644
--- a/ntt/include/nncase/ntt/caching.h
+++ b/ntt/include/nncase/ntt/caching.h
@@ -72,7 +72,7 @@ struct paged_attention_config
     using vectorized_axes_t = VectorizedAxes;
     using lanes_t = Lanes;
     using sharding_axes_t = ShardingAxes;
-    using axis_policies_t = std::tuple<AxisPolicies...>;
+    using axis_policies_t = ntt::tuple<AxisPolicies...>;
 
     static inline constexpr auto cache_layout = cache_layout_t{};
     static inline constexpr auto block_layout = block_layout_t{};
@@ -88,7 +88,7 @@ struct paged_attention_config
         if constexpr (index == -1_dim) {
             return distributed::shard_policy::B;
         } else {
-            return std::get<index>(axis_policies);
+            return ntt::get<index>(axis_policies);
         }
     }
 };
@@ -125,14 +125,16 @@ template <class TConfig> class attention_kv_cache {
           context_lens_(std::move(context_lens)),
           seq_lens_(std::move(seq_lens)) {}
 
-    size_t num_seqs() const noexcept { return num_seqs_; }
-    size_t num_tokens() const noexcept { return num_tokens_; }
+    constexpr size_t num_seqs() const noexcept { return num_seqs_; }
+    constexpr size_t num_tokens() const noexcept { return num_tokens_; }
 
-    int64_t context_len(int64_t request_id) const noexcept {
+    constexpr int64_t context_len(int64_t request_id) const noexcept {
         return context_lens_(request_id);
     }
 
-    int64_t seq_len(int64_t seq_id) const noexcept { return seq_lens_(seq_id); }
+    constexpr int64_t seq_len(int64_t seq_id) const noexcept {
+        return seq_lens_(seq_id);
+    }
 
   protected:
     size_t num_seqs_;
@@ -150,7 +152,7 @@ kv_dim(const distributed::shard_policy::split<Axes...> &split) noexcept {
 }
 
 template <class Mesh, class... AxisPolicies>
-constexpr auto kv_addr_shape(std::tuple<AxisPolicies...>) noexcept {
+constexpr auto kv_addr_shape(ntt::tuple<AxisPolicies...>) noexcept {
     return fixed_shape_v<kv_dim<Mesh>(AxisPolicies{})...>;
 }
 
@@ -177,7 +179,7 @@ constexpr auto origin_kv_cache_one_block_shape() noexcept {
     auto shard_shape = TConfig::sharding_axes.aggregate(
         vectorized_shape, [&](auto last_shape, auto sharding_axis, auto i) {
             using axis_policy_t =
-                std::tuple_element_t<i, typename TConfig::axis_policies_t>;
+                ntt::tuple_element_t<i, typename TConfig::axis_policies_t>;
             if constexpr (sharding_axis ==
                           fixed_dim_v<(
                               dim_t)paged_kvcache_dim_kind::num_blocks>) {
@@ -246,10 +248,12 @@ class paged_attention_kv_cache : public attention_kv_cache<TConfig> {
     using kv_addrs_t = decltype(make_tensor_view_from_address(
         std::declval<kv_storage_type_t **>(), kv_addrs_shape));
 
-    paged_attention_kv_cache(size_t num_seqs, size_t num_tokens,
-                             context_lens_t context_lens, seq_lens_t seq_lens,
-                             block_table_t block_table,
-                             slot_mapping_t slot_mapping, kv_addrs_t kv_addrs)
+    constexpr paged_attention_kv_cache(size_t num_seqs, size_t num_tokens,
+                                       context_lens_t context_lens,
+                                       seq_lens_t seq_lens,
+                                       block_table_t block_table,
+                                       slot_mapping_t slot_mapping,
+                                       kv_addrs_t kv_addrs)
         : attention_kv_cache<TConfig>(num_seqs, num_tokens, context_lens,
                                       seq_lens),
           block_table_(block_table),
@@ -311,7 +315,7 @@ class paged_attention_kv_cache : public attention_kv_cache<TConfig> {
         }
     }
 
-    auto block_table() const noexcept { return block_table_; }
+    constexpr auto block_table() const noexcept { return block_table_; }
 
   private:
     template <class T>
@@ -326,7 +330,7 @@ class paged_attention_kv_cache : public attention_kv_cache<TConfig> {
             generate_shape<block_shard_index.size()>([&](auto axis) {
                 const auto index = block_shard_index(axis);
                 const auto submesh_axes =
-                    std::get<axis>(TConfig::axis_policies).axes;
+                    ntt::get<axis>(TConfig::axis_policies).axes;
                 const auto submesh_shape = Mesh::shape.select(submesh_axes);
                 const auto local_program_id = linear_offset(
                     local_index.select(submesh_axes), submesh_shape);
diff --git a/ntt/include/nncase/ntt/compiler_defs.h b/ntt/include/nncase/ntt/compiler_defs.h
index a101a294c..516d5c926 100644
--- a/ntt/include/nncase/ntt/compiler_defs.h
+++ b/ntt/include/nncase/ntt/compiler_defs.h
@@ -52,3 +52,13 @@
     defined(__riscv_zvfbf)
 #define NTT_HAVE_NATIVE_BF16 1
 #endif
+
+#ifdef __CUDACC__
+#define NTT_HOST_DEVICE __host__ __device__
+#define NTT_DEVICE __device__
+#define NTT_KERNEL __global__
+#else
+#define NTT_HOST_DEVICE
+#define NTT_DEVICE
+#define NTT_KERNEL
+#endif
diff --git a/ntt/include/nncase/ntt/detail/shape_storage.h b/ntt/include/nncase/ntt/detail/shape_storage.h
index 66ecbb029..aaf13e676 100644
--- a/ntt/include/nncase/ntt/detail/shape_storage.h
+++ b/ntt/include/nncase/ntt/detail/shape_storage.h
@@ -14,7 +14,7 @@
  */
 #pragma once
 #include "../compiler_defs.h"
-#include "nncase/ntt/shape.h"
+#include "../tensor_traits.h"
 #include <cstddef>
 #include <type_traits>
 
@@ -68,7 +68,7 @@ struct NTT_EMPTY_BASES tensor_size_impl : public shape_storage<Shape>,
         class = std::enable_if_t<FixedShape<TDummy1> && FixedStrides<TDummy2>>>
     constexpr tensor_size_impl() noexcept {}
 
-    tensor_size_impl(Shape shape, Strides strides)
+    constexpr tensor_size_impl(Shape shape, Strides strides)
         : shape_storage<Shape>(shape), strides_storage<Strides>(strides) {}
 };
 } // namespace nncase::ntt::detail
diff --git a/ntt/include/nncase/ntt/detail/tensor_storage.h b/ntt/include/nncase/ntt/detail/tensor_storage.h
index 503150114..019c29512 100644
--- a/ntt/include/nncase/ntt/detail/tensor_storage.h
+++ b/ntt/include/nncase/ntt/detail/tensor_storage.h
@@ -14,6 +14,7 @@
  */
 #pragma once
 #include "../shape.h"
+#include "../std_containers.h"
 #include "nncase/ntt/tensor_traits.h"
 #include <vector>
 
@@ -23,7 +24,7 @@ template <class T, size_t MaxSize, bool IsView> class tensor_storage;
 // fixed tensor
 template <class T, size_t MaxSize> class tensor_storage<T, MaxSize, false> {
   public:
-    using buffer_type = std::array<T, MaxSize>;
+    using buffer_type = array<T, MaxSize>;
 
     constexpr tensor_storage() = default;
 
@@ -35,10 +36,10 @@ template <class T, size_t MaxSize> class tensor_storage<T, MaxSize, false> {
     constexpr const buffer_type &buffer() const noexcept { return buffer_; }
     constexpr buffer_type &buffer() noexcept { return buffer_; }
 
-    constexpr std::span<const T, MaxSize> elements() const noexcept {
+    constexpr span<const T, MaxSize> elements() const noexcept {
         return buffer_;
     }
-    constexpr std::span<T, MaxSize> elements() noexcept { return buffer_; }
+    constexpr span<T, MaxSize> elements() noexcept { return buffer_; }
 
   private:
     buffer_type buffer_;
@@ -47,7 +48,7 @@ template <class T, size_t MaxSize> class tensor_storage<T, MaxSize, false> {
 // fixed view
 template <class T, size_t MaxSize> class tensor_storage<T, MaxSize, true> {
   public:
-    using buffer_type = std::span<T, MaxSize>;
+    using buffer_type = span<T, MaxSize>;
 
     constexpr tensor_storage(std::in_place_t, buffer_type value)
         : buffer_(value) {}
@@ -55,10 +56,10 @@ template <class T, size_t MaxSize> class tensor_storage<T, MaxSize, true> {
     constexpr const buffer_type &buffer() const noexcept { return buffer_; }
     constexpr buffer_type &buffer() noexcept { return buffer_; }
 
-    constexpr std::span<const T, MaxSize> elements() const noexcept {
+    constexpr span<const T, MaxSize> elements() const noexcept {
         return buffer_;
     }
-    constexpr std::span<T, MaxSize> elements() noexcept { return buffer_; }
+    constexpr span<T, MaxSize> elements() noexcept { return buffer_; }
 
   private:
     buffer_type buffer_;
@@ -76,10 +77,10 @@ template <class T> class tensor_storage<T, std::dynamic_extent, false> {
     constexpr const buffer_type &buffer() const noexcept { return buffer_; }
     constexpr buffer_type &buffer() noexcept { return buffer_; }
 
-    constexpr std::span<const T> elements() const noexcept {
+    constexpr span<const T> elements() const noexcept {
         return {buffer_.data(), buffer_.size()};
     }
-    constexpr std::span<T> elements() noexcept {
+    constexpr span<T> elements() noexcept {
         return {buffer_.data(), buffer_.size()};
     }
 
@@ -98,10 +99,10 @@ template <> class tensor_storage<bool, std::dynamic_extent, false> {
     constexpr const buffer_type &buffer() const noexcept { return buffer_; }
     constexpr buffer_type &buffer() noexcept { return buffer_; }
 
-    std::span<const bool> elements() const noexcept {
+    span<const bool> elements() const noexcept {
         return {reinterpret_cast<const bool *>(buffer_.data()), buffer_.size()};
     }
-    std::span<bool> elements() noexcept {
+    span<bool> elements() noexcept {
         return {reinterpret_cast<bool *>(buffer_.data()), buffer_.size()};
     }
 
@@ -112,8 +113,8 @@ template <> class tensor_storage<bool, std::dynamic_extent, false> {
 // dynamic view
 template <class T> class tensor_storage<T, std::dynamic_extent, true> {
   public:
-    using const_buffer_type = std::span<const T>;
-    using buffer_type = std::span<T>;
+    using const_buffer_type = span<const T>;
+    using buffer_type = span<T>;
 
     constexpr tensor_storage(std::in_place_t, buffer_type value)
         : buffer_(value) {}
diff --git a/ntt/include/nncase/ntt/dimension.h b/ntt/include/nncase/ntt/dimension.h
index d77e7f5c8..aa9973d09 100644
--- a/ntt/include/nncase/ntt/dimension.h
+++ b/ntt/include/nncase/ntt/dimension.h
@@ -16,6 +16,7 @@
 #include "primitive_ops.h"
 #include "tensor_traits.h"
 #include <algorithm>
+#include <concepts>
 #include <cstdint>
 #include <numeric>
 #include <type_traits>
@@ -60,7 +61,7 @@ template <char c, char... cv> struct char_literal<c, cv...> {
 };
 } // namespace detail
 
-template <char... cv> inline constexpr auto operator"" _dim() {
+template <char... cv> inline constexpr auto operator""_dim() {
     constexpr auto value = detail::char_literal<cv...>::to_int;
     return fixed_dim_v<value>;
 }
@@ -184,16 +185,15 @@ constexpr auto positive_index(const TIndex &index,
         } else {
             return index;
         }
+    } else if constexpr (std::unsigned_integral<TIndex>) {
+        return index;
     } else {
         return index < 0 ? index + dim : index;
     }
 }
 
-namespace detail {
-template <class Cond, class T, class F> struct dim_where_impl;
-
-template <class Cond, Dimension T, Dimension F>
-struct dim_where_impl<Cond, T, F> {
+namespace ops {
+template <class Cond, Dimension T, Dimension F> struct where<Cond, T, F> {
     constexpr dim_t operator()(const Cond &cond, const T &true_dim,
                                const F &false_dim) const noexcept {
         return cond ? dim_value(true_dim) : dim_value(false_dim);
@@ -201,7 +201,7 @@ struct dim_where_impl<Cond, T, F> {
 };
 
 template <bool Value, class T, class F>
-struct dim_where_impl<std::integral_constant<bool, Value>, T, F> {
+struct where<std::integral_constant<bool, Value>, T, F> {
     constexpr auto
     operator()(const std::integral_constant<bool, Value> &,
                [[maybe_unused]] const T &true_dim,
@@ -213,12 +213,5 @@ struct dim_where_impl<std::integral_constant<bool, Value>, T, F> {
         }
     }
 };
-} // namespace detail
-
-template <class Cond, class T, class F>
-constexpr auto where(const Cond &cond, const T &true_dim,
-                     const F &false_dim) noexcept {
-    detail::dim_where_impl<Cond, T, F> impl;
-    return impl(cond, true_dim, false_dim);
-}
+} // namespace ops
 } // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/distributed.h b/ntt/include/nncase/ntt/distributed.h
index 63dd0093f..482928657 100644
--- a/ntt/include/nncase/ntt/distributed.h
+++ b/ntt/include/nncase/ntt/distributed.h
@@ -17,4 +17,3 @@
 #include "distributed/sharded_tensor.h"
 #include "distributed/sharding.h"
 #include "distributed/topology.h"
-#include "kernels/reshard.h"
diff --git a/ntt/include/nncase/ntt/distributed/sharding.h b/ntt/include/nncase/ntt/distributed/sharding.h
index 13919a27b..40a415b41 100644
--- a/ntt/include/nncase/ntt/distributed/sharding.h
+++ b/ntt/include/nncase/ntt/distributed/sharding.h
@@ -116,7 +116,7 @@ concept SplitShardPolicy = is_split_shard_policy<Policy>::value;
 
 template <class Mesh, class... AxisPolicies> struct sharding {
     using mesh_type = Mesh;
-    using axis_policies_type = std::tuple<AxisPolicies...>;
+    using axis_policies_type = ntt::tuple<AxisPolicies...>;
     using dynamic_offset_t = dynamic_shape_t<sizeof...(AxisPolicies)>;
 
     static constexpr auto rank() {
@@ -134,7 +134,7 @@ template <class Mesh, class... AxisPolicies> struct sharding {
     global_offset(const GlobalShape &global_shape,
                   const TShardIndex &shard_index) const noexcept {
         auto get_dim = [&, this]<size_t Axis> {
-            return std::get<Axis>(axis_policies)
+            return ntt::get<Axis>(axis_policies)
                 .template global_offset<Mesh>(global_shape[fixed_dim_v<Axis>],
                                               shard_index);
         };
@@ -148,7 +148,7 @@ template <class Mesh, class... AxisPolicies> struct sharding {
     constexpr auto shard_shape(const GlobalShape &global_shape,
                                const TShardIndex &shard_index) const noexcept {
         auto get_dim = [&, this]<size_t Axis> {
-            return std::get<Axis>(axis_policies)
+            return ntt::get<Axis>(axis_policies)
                 .template shard_dim<Mesh>(global_shape[fixed_dim_v<Axis>],
                                           shard_index);
         };
@@ -158,7 +158,7 @@ template <class Mesh, class... AxisPolicies> struct sharding {
         return get_all_dims(std::make_index_sequence<rank()>{});
     }
 
-    std::tuple<AxisPolicies...> axis_policies;
+    ntt::tuple<AxisPolicies...> axis_policies;
 };
 
 template <class Mesh, class... AxisPolicies>
@@ -170,7 +170,7 @@ namespace detail {
 template <class Sharding, class GlobalShape, size_t... Ids>
 constexpr bool is_divisible(const Sharding &sharding, const GlobalShape &shape,
                             std::index_sequence<Ids...>) noexcept {
-    return ((std::get<Ids>(sharding.axis_policies)
+    return ((ntt::get<Ids>(sharding.axis_policies)
                  .template is_divisible<typename Sharding::mesh_type>(
                      shape.at(Ids))) &&
             ...);
@@ -182,7 +182,7 @@ constexpr auto mesh_axes_mask_of_split_shard_policies() noexcept {
     return generate_shape<mesh_type::rank()>([](auto mesh_axis) {
         return make_index_shape<TSharding::rank()>().aggregate(
             dim_zero, [&](auto last_mask, auto axis, auto) {
-                using policy_t = std::tuple_element_t<
+                using policy_t = ntt::tuple_element_t<
                     axis, typename TSharding::axis_policies_type>;
                 if constexpr (distributed::SplitShardPolicy<policy_t>) {
                     if constexpr (policy_t::axes.contains(mesh_axis)) {
@@ -212,7 +212,7 @@ template <Sharding TSharding>
 constexpr auto tensor_axes_mask_of_split_shard_policies() noexcept {
     return generate_shape<TSharding::rank()>([](auto axis) {
         using policy_t =
-            std::tuple_element_t<axis, typename TSharding::axis_policies_type>;
+            ntt::tuple_element_t<axis, typename TSharding::axis_policies_type>;
         if constexpr (distributed::SplitShardPolicy<policy_t>) {
             return dim_one;
         } else {
@@ -239,7 +239,7 @@ constexpr auto local_shard_dim(const TSharding &sharding,
 
     using mesh_type = typename TSharding::mesh_type;
     const auto local_index = mesh_type::local_index();
-    return std::get<Axis>(sharding.axis_policies)
+    return ntt::get<Axis>(sharding.axis_policies)
         .template shard_dim<typename TSharding::mesh_type>(global_shape[fixed_dim_v<Axis>], local_index);
 }
 
diff --git a/ntt/include/nncase/ntt/distributed/topology.h b/ntt/include/nncase/ntt/distributed/topology.h
index 8491cf91e..c84971962 100644
--- a/ntt/include/nncase/ntt/distributed/topology.h
+++ b/ntt/include/nncase/ntt/distributed/topology.h
@@ -13,8 +13,11 @@
  * limitations under the License.
  */
 #pragma once
+#include "nncase/ntt/compiler_defs.h"
 #if defined(NNCASE_XPU_MODULE)
 #include "../arch/xpu/topology_def.h"
+#elif defined(__CUDA_ARCH__)
+#include "../arch/cuda/topology_def.h"
 #else
 #include "../arch/cpu/topology_def.h"
 #endif
@@ -70,17 +73,17 @@ constexpr auto topology_up_size() noexcept {
 }
 
 template <topology Topology> struct program_id_getter {
-    static dim_t id() noexcept;
+    NTT_HOST_DEVICE static dim_t id() noexcept;
 };
 
-template <topology Topology> dim_t program_id() noexcept {
+template <topology Topology> NTT_HOST_DEVICE dim_t program_id() noexcept {
     return program_id_getter<Topology>::id();
 }
 
 bool get_profiler_option() noexcept;
 
 template <topology Scope = (topology)(topology_levels - 1)>
-auto program_ids() noexcept {
+NTT_HOST_DEVICE auto program_ids() noexcept {
     auto f = []<size_t... Is>(std::index_sequence<Is...>) {
         return make_shape(program_id<static_cast<topology>(Is)>()...);
     };
@@ -89,7 +92,8 @@ auto program_ids() noexcept {
 
 template <topology Scope> class topology_synchronizer;
 
-template <topology Scope = (topology)0> void topology_synchronize() noexcept {
+template <topology Scope = (topology)0>
+NTT_HOST_DEVICE void topology_synchronize() noexcept {
     topology_synchronizer<Scope>::synchronize();
 }
 } // namespace nncase::ntt::distributed
diff --git a/ntt/include/nncase/ntt/kernels/binary.h b/ntt/include/nncase/ntt/kernels/binary.h
index 55f1f388b..8e10e54c3 100644
--- a/ntt/include/nncase/ntt/kernels/binary.h
+++ b/ntt/include/nncase/ntt/kernels/binary.h
@@ -26,8 +26,9 @@ class binary_impl
                               TRhs, TOut> {
   public:
     template <Tensor TBroadcastedLhs, Tensor TBroadcastedRhs, class TOp>
-    void invoke_ukernel(const TBroadcastedLhs &lhs, const TBroadcastedRhs &rhs,
-                        TOut &output, const TOp &op, bool is_broadcast) {
+    constexpr void invoke_ukernel(const TBroadcastedLhs &lhs,
+                                  const TBroadcastedRhs &rhs, TOut &output,
+                                  const TOp &op, bool is_broadcast) {
 
         auto lhs_conti_dims = contiguous_dims(lhs.shape(), lhs.strides());
         auto rhs_conti_dims = contiguous_dims(rhs.shape(), rhs.strides());
@@ -106,7 +107,7 @@ class binary_impl
 template <template <class T1, class T2> class TOp,
           template <class> class TPostOp = DefaultPostOp, Tensor TLhs,
           Tensor TRhs, class TOut>
-void binary(const TLhs &lhs, const TRhs &rhs, TOut &&output) {
+constexpr void binary(const TLhs &lhs, const TRhs &rhs, TOut &&output) {
     const TOp<std::remove_cv_t<typename TLhs::element_type>,
               std::remove_cv_t<typename TRhs::element_type>>
         op;
diff --git a/ntt/include/nncase/ntt/kernels/cast.h b/ntt/include/nncase/ntt/kernels/cast.h
index bba36a1b3..a76c8a84f 100644
--- a/ntt/include/nncase/ntt/kernels/cast.h
+++ b/ntt/include/nncase/ntt/kernels/cast.h
@@ -225,8 +225,8 @@ class cast_impl {
 
 template <template <class> class TPostOp = DefaultPostOp, Tensor TIn,
           Tensor TOut, FixedDimensions VectorizedAxes = shape_t<>>
-void cast(const TIn &input, TOut &&output,
-          const VectorizedAxes &vectorizedAxes = {}) noexcept {
+constexpr void cast(const TIn &input, TOut &&output,
+                    const VectorizedAxes &vectorizedAxes = {}) noexcept {
     detail::cast_impl<TIn, std::decay_t<TOut>, VectorizedAxes, TPostOp> impl;
     impl(input, output, vectorizedAxes);
 }
diff --git a/ntt/include/nncase/ntt/kernels/compare.h b/ntt/include/nncase/ntt/kernels/compare.h
index d97102f9e..92fbb8865 100644
--- a/ntt/include/nncase/ntt/kernels/compare.h
+++ b/ntt/include/nncase/ntt/kernels/compare.h
@@ -24,8 +24,9 @@ class compare_impl : public binary_like_impl<compare_impl<TLhs, TRhs, TOut>,
                                              TLhs, TRhs, TOut> {
   public:
     template <Tensor TBroadcastedLhs, Tensor TBroadcastedRhs, class Op>
-    void invoke_ukernel(const TBroadcastedLhs &lhs, const TBroadcastedRhs &rhs,
-                        TOut &output, const Op &op) {
+    constexpr void invoke_ukernel(const TBroadcastedLhs &lhs,
+                                  const TBroadcastedRhs &rhs, TOut &output,
+                                  const Op &op) {
         ntt::apply(output.shape(), [&](auto index) {
             output(index) = op(lhs(index), rhs(index));
         });
@@ -35,7 +36,7 @@ class compare_impl : public binary_like_impl<compare_impl<TLhs, TRhs, TOut>,
 
 template <template <class T1, class T2> class Op, Tensor TLhs, Tensor TRhs,
           class TOut>
-void compare(
+constexpr void compare(
     const TLhs &lhs, const TRhs &rhs, TOut &&output,
     const Op<typename TLhs::value_type, typename TRhs::value_type> &op = {}) {
     detail::compare_impl<TLhs, TRhs, std::decay_t<TOut>>()(lhs, rhs, output,
diff --git a/ntt/include/nncase/ntt/kernels/concat.h b/ntt/include/nncase/ntt/kernels/concat.h
index 50d0a5c43..cb7e2e74b 100644
--- a/ntt/include/nncase/ntt/kernels/concat.h
+++ b/ntt/include/nncase/ntt/kernels/concat.h
@@ -21,15 +21,15 @@
 
 namespace nncase::ntt {
 template <Tensor... TInputs, class TOut, FixedDimension TAxis>
-void concat(const std::tuple<TInputs...> &inputs, TOut &&output,
-            const TAxis &axis) {
+constexpr void concat(const ntt::tuple<TInputs...> &inputs, TOut &&output,
+                      const TAxis &axis) {
     const auto domain =
         shape_infer::reduced_shape_by_axis<TAxis::value>(output.shape());
     dynamic_shape_t<domain.rank()> in_index{};
     apply(domain, [&](auto index) {
         loop<domain.rank()>([&](auto i) { in_index[i] = index[i]; });
         loop<sizeof...(TInputs)>([&](auto i) {
-            auto input = std::get<i>(inputs);
+            auto input = ntt::get<i>(inputs);
             for (in_index[axis] = 0; in_index[axis] < input.shape()[axis];
                  in_index[axis]++, index[axis]++) {
                 output(index) = input(in_index);
diff --git a/ntt/include/nncase/ntt/kernels/conv2d.h b/ntt/include/nncase/ntt/kernels/conv2d.h
index 99ce2cc97..efef6d815 100644
--- a/ntt/include/nncase/ntt/kernels/conv2d.h
+++ b/ntt/include/nncase/ntt/kernels/conv2d.h
@@ -23,9 +23,10 @@ namespace conv_detail {
 template <Tensor TInput, Tensor TWeights, Tensor TBias, Tensor TOutput,
           Dimensions TStride, Paddings TPadding, Dimensions TDilation,
           Dimension TGroups>
-void impl(const TInput &input, const TWeights &weights, const TBias &bias,
-          TOutput &output, const TStride &stride, const TPadding &padding,
-          const TDilation &dilation, const TGroups &groups) noexcept {
+constexpr void impl(const TInput &input, const TWeights &weights,
+                    const TBias &bias, TOutput &output, const TStride &stride,
+                    const TPadding &padding, const TDilation &dilation,
+                    const TGroups &groups) noexcept {
     using TElem = typename TInput::element_type;
     const auto in_shape = input.shape();
     const auto out_channels = weights.shape().template at<0>();
@@ -112,9 +113,10 @@ void impl(const TInput &input, const TWeights &weights, const TBias &bias,
 template <Tensor TInput, Tensor TWeights, Tensor TBias, class TOutput,
           Dimensions TStride, Paddings TPadding, Dimensions TDilation,
           Dimension TGroups>
-void conv2d(const TInput &input, const TWeights &weights, const TBias &bias,
-            TOutput &&output, const TStride &stride, const TPadding &padding,
-            const TDilation &dilation, const TGroups &groups) noexcept {
+constexpr void
+conv2d(const TInput &input, const TWeights &weights, const TBias &bias,
+       TOutput &&output, const TStride &stride, const TPadding &padding,
+       const TDilation &dilation, const TGroups &groups) noexcept {
     conv_detail::impl(input, weights, bias, output, stride, padding, dilation,
                       groups);
 }
diff --git a/ntt/include/nncase/ntt/kernels/copy.h b/ntt/include/nncase/ntt/kernels/copy.h
index 68451768b..2e6df8e8f 100644
--- a/ntt/include/nncase/ntt/kernels/copy.h
+++ b/ntt/include/nncase/ntt/kernels/copy.h
@@ -21,7 +21,7 @@ namespace detail {
 template <class TIn, class TOut, bool Arch> class copy_impl;
 template <class TIn, class TOut, bool Arch> class copy_impl {
   public:
-    void operator()(const TIn &input, TOut &output) {
+    constexpr void operator()(const TIn &input, TOut &output) {
         nncase::ntt::template unary<ops::copy>(input, output);
     }
 };
@@ -29,21 +29,22 @@ template <class TIn, class TOut, bool Arch> class copy_impl {
 template <class NOUSE, bool Arch> class copy_wait_impl;
 template <class NOUSE, bool Arch> class copy_wait_impl {
   public:
-    void operator()() {}
+    constexpr void operator()() {}
 };
 } // namespace detail
 
-template <class NOUSE>
-void tensor_copy_wait() noexcept { detail::copy_wait_impl<NOUSE, true>()(); }
+template <class NOUSE> constexpr void tensor_copy_wait() noexcept {
+    detail::copy_wait_impl<NOUSE, true>()();
+}
 
 template <class TIn, class TOut>
-void tensor_copy_async(const TIn &input, TOut &&output) noexcept {
+constexpr void tensor_copy_async(const TIn &input, TOut &&output) noexcept {
     detail::copy_impl<TIn, TOut, true> impl;
     impl(input, output);
 }
 
 template <class TIn, class TOut>
-void tensor_copy_sync(const TIn &input, TOut &&output) noexcept {
+constexpr void tensor_copy_sync(const TIn &input, TOut &&output) noexcept {
     detail::copy_impl<TIn, TOut, true> impl;
     impl(input, output);
     tensor_copy_wait<void>();
@@ -57,7 +58,7 @@ template <Tensor TTensor, bool Arch> struct tensor_zero_impl {
 };
 } // namespace detail
 
-template <class TOut> void tensor_zero(TOut &&output) noexcept {
+template <class TOut> constexpr void tensor_zero(TOut &&output) noexcept {
     detail::tensor_zero_impl<std::decay_t<TOut>, true>()(output);
 }
 } // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/kernels/detail/elementwise_impl.h b/ntt/include/nncase/ntt/kernels/detail/elementwise_impl.h
index 835edc297..0a93edd9c 100644
--- a/ntt/include/nncase/ntt/kernels/detail/elementwise_impl.h
+++ b/ntt/include/nncase/ntt/kernels/detail/elementwise_impl.h
@@ -27,7 +27,7 @@ class elementwise_impl {
     constexpr void operator()(const TInputs &...inputs, TOutput &output,
                               TArgs &&...args) {
         apply_broadcasted(
-            std::make_tuple(inputs.broadcast_to(output.shape())...), output,
+            ntt::make_tuple(inputs.broadcast_to(output.shape())...), output,
             std::make_index_sequence<sizeof...(TInputs)>(),
             std::forward<TArgs>(args)...);
     }
@@ -35,14 +35,16 @@ class elementwise_impl {
   private:
     template <Tensor... TBroadcastedInputs, size_t... I, class... TArgs>
     constexpr void
-    apply_broadcasted(const std::tuple<TBroadcastedInputs...> &inputs,
+    apply_broadcasted(const ntt::tuple<TBroadcastedInputs...> &inputs,
                       TOutput &output, std::index_sequence<I...>,
                       TArgs &&...args) {
-        derived().apply(std::get<I>(inputs)..., output,
+        derived().apply(ntt::get<I>(inputs)..., output,
                         std::forward<TArgs>(args)...);
     }
 
   protected:
-    TDerived &derived() noexcept { return static_cast<TDerived &>(*this); }
+    constexpr TDerived &derived() noexcept {
+        return static_cast<TDerived &>(*this);
+    }
 };
 } // namespace nncase::ntt::detail
diff --git a/ntt/include/nncase/ntt/kernels/expand.h b/ntt/include/nncase/ntt/kernels/expand.h
index 1b43daa2f..e95ef01c2 100644
--- a/ntt/include/nncase/ntt/kernels/expand.h
+++ b/ntt/include/nncase/ntt/kernels/expand.h
@@ -25,7 +25,7 @@ class expand_impl : public unary_like_impl<expand_impl<TIn, TOut>, TIn, TOut> {
 
   public:
     template <Tensor TBroadcastedIn>
-    void invoke_ukernel(const TBroadcastedIn &input, TOut &output) {
+    constexpr void invoke_ukernel(const TBroadcastedIn &input, TOut &output) {
         ntt::apply(output.shape(),
                    [&](auto index) { output(index) = input(index); });
     }
@@ -34,7 +34,7 @@ class expand_impl : public unary_like_impl<expand_impl<TIn, TOut>, TIn, TOut> {
 } // namespace detail
 
 template <Tensor TIn, typename TOut>
-void expand(const TIn &input, TOut &&output) noexcept {
+constexpr void expand(const TIn &input, TOut &&output) noexcept {
     detail::expand_impl<TIn, std::decay_t<TOut>> impl;
     impl(input, output);
 }
diff --git a/ntt/include/nncase/ntt/kernels/gather.h b/ntt/include/nncase/ntt/kernels/gather.h
index c0d528dec..83d3f8e3a 100644
--- a/ntt/include/nncase/ntt/kernels/gather.h
+++ b/ntt/include/nncase/ntt/kernels/gather.h
@@ -128,8 +128,8 @@ template <Tensor TA, Tensor TB, Tensor TC> class gather_impl {
     };
 
     template <typename T>
-    size_t find_continuous_segments(const T *arr, size_t arrSize,
-                                    segment *segments) {
+    constexpr size_t find_continuous_segments(const T *arr, size_t arrSize,
+                                              segment *segments) {
         if (arrSize == 0)
             return 0;
 
@@ -221,15 +221,15 @@ class distributed_gather_impl {
 } // namespace detail
 
 template <Tensor TA, Tensor TB, class TC, FixedDimension TAxis>
-void gather(const TA &input, const TB &indices, TC &&output,
-            const TAxis &axis) noexcept {
+constexpr void gather(const TA &input, const TB &indices, TC &&output,
+                      const TAxis &axis) noexcept {
     detail::gather_impl<TA, TB, std::decay_t<TC>> impl;
     impl(input, indices, output, axis);
 }
 
 template <ShardedTensor TA, Tensor TB, class TC, FixedDimension TAxis>
-void gather(const TA &input, const TB &indices, TC &&output,
-            const TAxis &axis) noexcept {
+constexpr void gather(const TA &input, const TB &indices, TC &&output,
+                      const TAxis &axis) noexcept {
     detail::distributed_gather_impl<TA, TB, std::decay_t<TC>> impl;
     impl(input, indices, output, axis);
 }
diff --git a/ntt/include/nncase/ntt/kernels/get_item.h b/ntt/include/nncase/ntt/kernels/get_item.h
index ecc4598a5..722770478 100644
--- a/ntt/include/nncase/ntt/kernels/get_item.h
+++ b/ntt/include/nncase/ntt/kernels/get_item.h
@@ -21,7 +21,8 @@
 namespace nncase::ntt {
 template <Tensor TIn, class TIndices, class TOut>
     requires(Dimensions<TIndices> || Dimension<TIndices>)
-void get_item(const TIn &input, TOut &&output, const TIndices &indices) {
+constexpr void get_item(const TIn &input, TOut &&output,
+                        const TIndices &indices) {
     if constexpr (Dimensions<TIndices>) {
         const auto positive_indices =
             generate_shape<TIndices::rank()>([&](auto axis) {
diff --git a/ntt/include/nncase/ntt/kernels/get_position_ids.h b/ntt/include/nncase/ntt/kernels/get_position_ids.h
index d10d60949..692e81913 100644
--- a/ntt/include/nncase/ntt/kernels/get_position_ids.h
+++ b/ntt/include/nncase/ntt/kernels/get_position_ids.h
@@ -20,9 +20,9 @@
 namespace nncase::ntt {
 
 template <class TKVCache, Tensor TOut, class TSharding, Shape TGlobalShape>
-void get_position_ids(TKVCache &&kv_cache_tensor, TOut output,
-                      const TSharding &sharding,
-                      const TGlobalShape &global_shape) {
+constexpr void get_position_ids(TKVCache &&kv_cache_tensor, TOut output,
+                                const TSharding &sharding,
+                                const TGlobalShape &global_shape) {
     using TOutType = typename std::decay_t<TOut>;
     using mesh_type = typename TSharding::mesh_type;
     using TOutElem = typename TOutType::value_type;
diff --git a/ntt/include/nncase/ntt/kernels/im2col.h b/ntt/include/nncase/ntt/kernels/im2col.h
index ef094fe6d..9951607c8 100644
--- a/ntt/include/nncase/ntt/kernels/im2col.h
+++ b/ntt/include/nncase/ntt/kernels/im2col.h
@@ -33,11 +33,12 @@ template <Tensor TIn, Tensor TOut, Dimensions TKernel, Dimensions TStrides,
           Paddings TPadding, FixedShape VectorizedAxes, FixedShape PadedNums>
     requires(VectorizedAxes::rank() == 0 ||
              (VectorizedAxes::rank() == 1 && VectorizedAxes{}.at(0) == 1))
-void im2col_impl(const TIn &input, TOut &output,
-                 [[maybe_unused]] const TKernel &kernel,
-                 const TStrides &strides, const TPadding &padding,
-                 [[maybe_unused]] const VectorizedAxes &vectorizedAxes,
-                 [[maybe_unused]] const PadedNums &padedNums) {
+constexpr void
+im2col_impl(const TIn &input, TOut &output,
+            [[maybe_unused]] const TKernel &kernel, const TStrides &strides,
+            const TPadding &padding,
+            [[maybe_unused]] const VectorizedAxes &vectorizedAxes,
+            [[maybe_unused]] const PadedNums &padedNums) {
     using TElem = typename TIn::element_type;
     const auto input_shape = input.shape();
     const auto input_strides = input.strides();
@@ -49,7 +50,10 @@ void im2col_impl(const TIn &input, TOut &output,
     const auto OW = shape_infer::windowed_output_size(
         input_shape[3_dim], kernel[1_dim], strides[1_dim], 1_dim,
         padding[1_dim]);
-    const auto [batch, IC, IH, IW] = input_shape;
+    const auto batch = input_shape[0_dim];
+    const auto IC = input_shape[1_dim];
+    const auto IH = input_shape[2_dim];
+    const auto IW = input_shape[3_dim];
     const auto pad_h = padding[0_dim];
     const auto pad_w = padding[1_dim];
     const auto kernel_h = kernel[0_dim];
@@ -99,11 +103,12 @@ void im2col_impl(const TIn &input, TOut &output,
  */
 template <Tensor TIn, class TOut, Dimensions TKernel, Dimensions TStrides,
           Paddings TPadding = decltype(make_zeros_paddings<2>()),
-          FixedShape VectorizedAxes = shape_t<>, FixedShape PadedNums = shape_t<>>
-void im2col(const TIn &input, TOut &&output, const TKernel &kernel,
-            const TStrides &strides, const TPadding &padding = {},
-            const VectorizedAxes &vectorizedAxes = {},
-            const PadedNums &padedNums = {}) {
+          FixedShape VectorizedAxes = shape_t<>,
+          FixedShape PadedNums = shape_t<>>
+constexpr void im2col(const TIn &input, TOut &&output, const TKernel &kernel,
+                      const TStrides &strides, const TPadding &padding = {},
+                      const VectorizedAxes &vectorizedAxes = {},
+                      const PadedNums &padedNums = {}) {
     im2col_details::im2col_impl(input, output, kernel, strides, padding,
                                 vectorizedAxes, padedNums);
 }
diff --git a/ntt/include/nncase/ntt/kernels/layer_norm.h b/ntt/include/nncase/ntt/kernels/layer_norm.h
index 3c6409c59..c85f16ac4 100644
--- a/ntt/include/nncase/ntt/kernels/layer_norm.h
+++ b/ntt/include/nncase/ntt/kernels/layer_norm.h
@@ -26,10 +26,11 @@ namespace vectorized_layer_norm_detail {
 template <bool UseMean, Tensor TIn, Tensor TScale, Tensor TBias, typename TOut,
           FixedDimensions VectorizedAxes, Dimensions PadedNums,
           FixedDimension TAxis>
-void within_axis_vectorize_impl(const TIn &input, const TScale &scale,
-                                const TBias &bias, TOut &output,
-                                const float &epsilon, const VectorizedAxes &,
-                                const PadedNums &, const TAxis &) {
+constexpr void within_axis_vectorize_impl(const TIn &input, const TScale &scale,
+                                          const TBias &bias, TOut &output,
+                                          const float &epsilon,
+                                          const VectorizedAxes &,
+                                          const PadedNums &, const TAxis &) {
 
     using TElem = typename TIn::element_type;
     using TScaleElem = typename TScale::element_type;
@@ -94,19 +95,22 @@ void within_axis_vectorize_impl(const TIn &input, const TScale &scale,
                 const auto extended_sum_s =
                     reduce_sum(extended_sum) * norm_factor;
                 auto extended_add = extended_sum_s + epsilon;
-                auto rsqrt =
-                    ntt::rsqrt(extended_add);
+                auto rsqrt = ntt::rsqrt(extended_add);
 
                 if constexpr (UseMean) {
                     for (auto i = 0; i < inner_size; i++) {
-                        auto val = (input_p[offset + i] - mean) * ntt::cast_elem<TElemScalar>(rsqrt);
+                        auto val = (input_p[offset + i] - mean) *
+                                   ntt::cast_elem<TElemScalar>(rsqrt);
                         output_p[offset + i] =
                             ntt::mul_add(val, scale_p[i], bias_p[i]);
                     }
                 } else {
                     for (auto i = 0; i < inner_size; i++) {
-                        auto val = ntt::cast_elem<float>(input_p[offset + i]) * rsqrt;
-                        output_p[offset + i] = ntt::cast_elem<TElemScalar>(val * ntt::cast_elem<float>(scale_p[i]) + ntt::cast_elem<float>(bias_p[i]));
+                        auto val =
+                            ntt::cast_elem<float>(input_p[offset + i]) * rsqrt;
+                        output_p[offset + i] = ntt::cast_elem<TElemScalar>(
+                            val * ntt::cast_elem<float>(scale_p[i]) +
+                            ntt::cast_elem<float>(bias_p[i]));
                     }
                 }
             } else {
@@ -135,19 +139,22 @@ void within_axis_vectorize_impl(const TIn &input, const TScale &scale,
 
                 extended_sum *= norm_factor;
                 auto extended_add = extended_sum + epsilon;
-                auto rsqrt =
-                    ntt::rsqrt(extended_add);
+                auto rsqrt = ntt::rsqrt(extended_add);
 
                 if constexpr (UseMean) {
                     for (auto i = 0; i < inner_size; i++) {
-                        auto val = (input_p[offset + i] - mean) * ntt::cast_elem<TElemScalar>(rsqrt);
+                        auto val = (input_p[offset + i] - mean) *
+                                   ntt::cast_elem<TElemScalar>(rsqrt);
                         output_p[offset + i] =
                             ntt::mul_add(val, scale_p[i], bias_p[i]);
                     }
                 } else {
                     for (auto i = 0; i < inner_size; i++) {
-                        auto val = ntt::cast_elem<float>(input_p[offset + i]) * rsqrt;
-                        output_p[offset + i] = ntt::cast_elem<TElemScalar>(val * ntt::cast_elem<float>(scale_p[i]) + ntt::cast_elem<float>(bias_p[i]));
+                        auto val =
+                            ntt::cast_elem<float>(input_p[offset + i]) * rsqrt;
+                        output_p[offset + i] = ntt::cast_elem<TElemScalar>(
+                            val * ntt::cast_elem<float>(scale_p[i]) +
+                            ntt::cast_elem<float>(bias_p[i]));
                     }
                 }
             }
@@ -161,11 +168,12 @@ template <bool UseMean = true, Tensor TIn, Tensor TScale, Tensor TBias,
           typename TOut, FixedDimension TAxis,
           FixedDimensions VectorizedAxes = shape_t<>,
           Dimensions PadedNums = shape_t<>>
-void vectorized_layer_norm(const TIn &input, const TScale &scale,
-                           const TBias &bias, TOut &&output,
-                           const float &epsilon, const TAxis &axis = -1_dim,
-                           const VectorizedAxes &vectorizedAxes = {},
-                           const PadedNums &padedNums = {}) {
+constexpr void vectorized_layer_norm(const TIn &input, const TScale &scale,
+                                     const TBias &bias, TOut &&output,
+                                     const float &epsilon,
+                                     const TAxis &axis = -1_dim,
+                                     const VectorizedAxes &vectorizedAxes = {},
+                                     const PadedNums &padedNums = {}) {
     static_assert(VectorizedAxes::rank() < 2,
                   "currently not support 2d packing.");
 
diff --git a/ntt/include/nncase/ntt/kernels/matmul.h b/ntt/include/nncase/ntt/kernels/matmul.h
index f5bdd1989..7d1c1f2e8 100644
--- a/ntt/include/nncase/ntt/kernels/matmul.h
+++ b/ntt/include/nncase/ntt/kernels/matmul.h
@@ -22,12 +22,14 @@
 namespace nncase::ntt {
 namespace detail {
 
-template <typename T> concept HasValidRank = requires(T t) {
+template <typename T>
+concept HasValidRank = requires(T t) {
     T::rank();
     requires T::rank() >= 2;
 };
 
-template <typename T> concept ValidMatmulTensor = Tensor<T> &&HasValidRank<T>;
+template <typename T>
+concept ValidMatmulTensor = Tensor<T> && HasValidRank<T>;
 
 template <class TLhs, class TRhs, typename LhsVectorizedAxes,
           typename RhsVectorizedAxes, bool TransposedA = false,
@@ -122,8 +124,8 @@ class matmul_impl<AccumulateC, false, TransposedB, TLhs, TRhs, TOut, TScale,
         TOutElem, true>;
 
   public:
-    void operator()(const TLhs &lhs, const TRhs &rhs, TOut &output,
-                    const TScale &scale) {
+    constexpr void operator()(const TLhs &lhs, const TRhs &rhs, TOut &output,
+                              const TScale &scale) {
         const auto domain =
             output.shape().template slice<0, TOut::rank() - 2>();
         ntt::apply(domain, [&](auto out_offset_prefix) {
@@ -200,8 +202,9 @@ class matmul_impl<AccumulateC, false, TransposedB, TLhs, TRhs, TOut, TScale,
 
     template <dim_t M0Tile, dim_t N0Tile, class TA, class TB, class TC,
               Dimension TK>
-    void matmul_2d_l0(const TA &a, const TB &b, TC &c, const TScale &scale,
-                      const TK &K, dim_t m1, dim_t n1) {
+    constexpr void matmul_2d_l0(const TA &a, const TB &b, TC &c,
+                                const TScale &scale, const TK &K, dim_t m1,
+                                dim_t n1) {
 
         constexpr auto m0_scale =
             ukernels::u_type_scale<vectorize_kind, TA, TB, TC>::m0_scale;
@@ -250,16 +253,16 @@ template <bool AccumulateC = false, bool TransposedA = false,
           FixedDimensions LhsPadedNums = shape_t<>,
           FixedDimensions RhsVectorizedAxes = shape_t<>,
           FixedDimensions RhsPadedNums = shape_t<>>
-void matmul(
-    [[maybe_unused]] const TLhs &lhs, [[maybe_unused]] const TRhs &rhs,
-    [[maybe_unused]] TOut &&output,
-    [[maybe_unused]] const TScale &scale = nullptr,
-    [[maybe_unused]] const LhsVectorizedAxes &lhsVectorizedAxes =
-        fixed_shape_v<>,
-    [[maybe_unused]] const LhsPadedNums &lhsPadedNums = fixed_shape_v<>,
-    [[maybe_unused]] const RhsVectorizedAxes &rhsVectorizedAxes =
-        fixed_shape_v<>,
-    [[maybe_unused]] const RhsPadedNums &rhsPadedNums = fixed_shape_v<>) {
+constexpr void
+matmul([[maybe_unused]] const TLhs &lhs, [[maybe_unused]] const TRhs &rhs,
+       [[maybe_unused]] TOut &&output,
+       [[maybe_unused]] const TScale &scale = nullptr,
+       [[maybe_unused]] const LhsVectorizedAxes &lhsVectorizedAxes =
+           fixed_shape_v<>,
+       [[maybe_unused]] const LhsPadedNums &lhsPadedNums = fixed_shape_v<>,
+       [[maybe_unused]] const RhsVectorizedAxes &rhsVectorizedAxes =
+           fixed_shape_v<>,
+       [[maybe_unused]] const RhsPadedNums &rhsPadedNums = fixed_shape_v<>) {
 
     constexpr LhsPadedNums lhsPadedNumsType;
     constexpr RhsPadedNums rhsPadedNumsType;
diff --git a/ntt/include/nncase/ntt/kernels/pack.h b/ntt/include/nncase/ntt/kernels/pack.h
index 536c3f37a..0d47a0145 100644
--- a/ntt/include/nncase/ntt/kernels/pack.h
+++ b/ntt/include/nncase/ntt/kernels/pack.h
@@ -156,7 +156,8 @@ template <Tensor TIn, Tensor TOut> class pack_impl<TIn, TOut, 1> {
 } // namespace detail
 
 template <Tensor TIn, class TOut, FixedDimensions TAxes>
-void pack(const TIn &input, TOut &&output, const TAxes &axes) noexcept {
+constexpr void pack(const TIn &input, TOut &&output,
+                    const TAxes &axes) noexcept {
     using TVec = typename std::decay_t<TOut>::element_type;
     static_assert(TVec::rank() ==
                       vector_rank_v<typename TIn::value_type> + TAxes::rank(),
diff --git a/ntt/include/nncase/ntt/kernels/pad.h b/ntt/include/nncase/ntt/kernels/pad.h
index 281759d5e..71cb4fffe 100644
--- a/ntt/include/nncase/ntt/kernels/pad.h
+++ b/ntt/include/nncase/ntt/kernels/pad.h
@@ -21,8 +21,8 @@
 namespace nncase::ntt {
 namespace pad_detail {
 template <Tensor TIn, Tensor TOut, Paddings TPaddings, ScalarOrVector TElem>
-void pad_impl(const TIn &input, TOut &output, const TPaddings &paddings,
-              const TElem &pad_value) {
+constexpr void pad_impl(const TIn &input, TOut &output,
+                        const TPaddings &paddings, const TElem &pad_value) {
     constexpr auto rank = TIn::rank();
     ntt::apply(output.shape(), [&](auto out_index) {
         bool dopad = false;
@@ -52,8 +52,8 @@ void pad_impl(const TIn &input, TOut &output, const TPaddings &paddings,
 template <Tensor TIn, class TOut, Paddings TPaddings,
           ScalarOrVector TElem = typename TIn::element_type>
     requires(bool(TIn::rank() == TPaddings::rank()))
-void pad(const TIn &input, TOut &&output, const TPaddings &paddings,
-         const TElem &pad_value = {}) noexcept {
+constexpr void pad(const TIn &input, TOut &&output, const TPaddings &paddings,
+                   const TElem &pad_value = {}) noexcept {
     pad_detail::pad_impl(input, output, paddings, pad_value);
 }
 } // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/kernels/paged_attention.h b/ntt/include/nncase/ntt/kernels/paged_attention.h
index d109d965b..2f8f6cb11 100644
--- a/ntt/include/nncase/ntt/kernels/paged_attention.h
+++ b/ntt/include/nncase/ntt/kernels/paged_attention.h
@@ -180,12 +180,12 @@ constexpr void update_paged_attention_kv_cache(const TSlots &slots_tensor,
 template <FixedDimensions QLayout, ShardedTensor TQ, Tensor TKVCache,
           Tensor TScale, class TOutput, Tensor TExtra>
     requires(Tensor<std::decay_t<TOutput>>)
-void paged_attention(
-    const TQ &q_tensor, TKVCache &kv_cache_tensor,
-    TExtra &extra_tensor, /* [head_q, max_query_len, max_seq_len] + [head_q,
-                            max_query_len, 1] */
-    const TScale &scale, size_t layer_id, TOutput &&output_tensor,
-    const QLayout &q_layout) noexcept {
+constexpr void
+paged_attention(const TQ &q_tensor, TKVCache &kv_cache_tensor,
+                TExtra &extra_tensor, /* [head_q, max_query_len, max_seq_len] +
+                                        [head_q, max_query_len, 1] */
+                const TScale &scale, size_t layer_id, TOutput &&output_tensor,
+                const QLayout &q_layout) noexcept {
     auto &kv_cache = kv_cache_tensor(fixed_shape_v<>);
     using kv_cache_t = typename std::decay_t<decltype(kv_cache)>;
     using config_t = typename kv_cache_t::config_t;
@@ -229,7 +229,11 @@ void paged_attention(
             (s_shape.length() + reduce_s_shape.length()) *
                 (sizeof(kv_prim_type_t))) {
             printf("extra_tensor is not enough.\n");
+#ifdef __CUDA_ARCH__
+            cuda::std::terminate();
+#else
             std::terminate();
+#endif
         }
 
         auto s = make_tensor_view_from_address(
@@ -398,7 +402,7 @@ void paged_attention(
 
 template <class T0, class T1, class T2, class T3, class T4, class T5, class T6,
           class T7, class T8>
-void identity_paged_attention_kv_cache(
+constexpr void identity_paged_attention_kv_cache(
     [[maybe_unused]] T0 input, [[maybe_unused]] T1 num_seqs,
     [[maybe_unused]] T2 num_tokens, [[maybe_unused]] T3 context_lens,
     [[maybe_unused]] T4 seq_lens, [[maybe_unused]] T5 block_table,
@@ -408,8 +412,9 @@ void identity_paged_attention_kv_cache(
 }
 
 template <ShardedTensor T0, class T1, class T2>
-void gather_paged_attention_kv_cache([[maybe_unused]] const T0 &value,
-                                     T1 &&kv_cache_tensor, T2 &&output_tensor) {
+constexpr void gather_paged_attention_kv_cache([[maybe_unused]] const T0 &value,
+                                               T1 &&kv_cache_tensor,
+                                               T2 &&output_tensor) {
     auto &kv_cache = kv_cache_tensor(fixed_shape_v<>);
     using kv_cache_t = typename std::decay_t<decltype(kv_cache)>;
     using config_t = typename kv_cache_t::config_t;
@@ -419,7 +424,7 @@ void gather_paged_attention_kv_cache([[maybe_unused]] const T0 &value,
     const auto kv_cache_index =
         generate_shape<config_t::sharding_axes_t::rank()>([&](auto axis) {
             const auto submesh_axes =
-                std::get<axis>(config_t::axis_policies).axes;
+                ntt::get<axis>(config_t::axis_policies).axes;
             const auto submesh_shape = mesh_type::shape.select(submesh_axes);
             const auto local_program_id =
                 linear_offset(local_index.select(submesh_axes), submesh_shape);
diff --git a/ntt/include/nncase/ntt/kernels/reduce.h b/ntt/include/nncase/ntt/kernels/reduce.h
index 25d4d786d..ab1f48aab 100644
--- a/ntt/include/nncase/ntt/kernels/reduce.h
+++ b/ntt/include/nncase/ntt/kernels/reduce.h
@@ -14,11 +14,10 @@
  */
 #pragma once
 #include "../primitive_ops.h"
+#include "../shape.h"
 #include "../shape_infer/reduce.h"
 #include "../ukernels.h"
 #include "../utility.h"
-#include "nncase/ntt/dimension.h"
-#include "nncase/ntt/tensor_traits.h"
 #include <limits>
 
 namespace nncase::ntt {
@@ -152,11 +151,10 @@ class reduce_impl {
         }
     }
 
-    template <size_t Axis, class TSubIn>
-    constexpr void
-    apply_contiguous_reduce(dynamic_shape_t<TSubIn::rank()> &index,
-                            size_t conti_dims, const TSubIn &input,
-                            TInElem &reduced_in) {
+    template <size_t Axis, class TIndex, class TSubIn>
+    constexpr void apply_contiguous_reduce(TIndex &index, size_t conti_dims,
+                                           const TSubIn &input,
+                                           TInElem &reduced_in) {
         const auto outer_dims = TSubIn::rank() - conti_dims;
         const auto axis_v = fixed_dim_v<Axis>;
         if (Axis >= outer_dims) {
@@ -201,9 +199,11 @@ template <reduce_op Op, bool LoadPrevious = false, Tensor TIn, class TOut,
           FixedDimensions VectorizedAxes = shape_t<>,
           FixedDimensions PadedNums =
               decltype(make_zeros_shape<VectorizedAxes::rank()>())>
-void reduce(const TIn &input, TOut &&output, const TReduceAxes &reduce_axes,
-            const VectorizedAxes &vectorized_axes = {},
-            [[maybe_unused]] const PadedNums &paded_nums = {}) noexcept {
+constexpr void
+reduce(const TIn &input, TOut &&output,
+       [[maybe_unused]] const TReduceAxes &reduce_axes,
+       [[maybe_unused]] const VectorizedAxes &vectorized_axes = {},
+       [[maybe_unused]] const PadedNums &paded_nums = {}) noexcept {
     static_assert(!(LoadPrevious && Op == reduce_op::mean),
                   "not support reduce mean splited on reduce axis");
     detail::reduce_impl<Op, LoadPrevious, TIn, std::decay_t<TOut>, PadedNums>
@@ -217,10 +217,10 @@ void reduce(const TIn &input, TOut &&output, const TReduceAxes &reduce_axes,
               FixedDimensions VectorizedAxes = shape_t<>,                      \
               FixedDimensions PadedNums =                                      \
                   decltype(make_zeros_shape<VectorizedAxes::rank()>())>        \
-    void reduce_##op(const TIn &input, TOut &&output,                          \
-                     const TReduceAxes &reduce_axes,                           \
-                     const VectorizedAxes &vectorized_axes = {},               \
-                     const PadedNums &paded_nums = {}) noexcept {              \
+    constexpr void reduce_##op(const TIn &input, TOut &&output,                \
+                               const TReduceAxes &reduce_axes,                 \
+                               const VectorizedAxes &vectorized_axes = {},     \
+                               const PadedNums &paded_nums = {}) noexcept {    \
         return reduce<reduce_op::op, LoadPrevious>(                            \
             input, std::forward<TOut>(output), reduce_axes, vectorized_axes,   \
             paded_nums);                                                       \
diff --git a/ntt/include/nncase/ntt/kernels/reshard.h b/ntt/include/nncase/ntt/kernels/reshard.h
index 312359159..24c2521b9 100644
--- a/ntt/include/nncase/ntt/kernels/reshard.h
+++ b/ntt/include/nncase/ntt/kernels/reshard.h
@@ -18,6 +18,7 @@
 #include "../distributed/topology.h"
 #include "../primitive_ops.h"
 #include "../shape.h"
+#include "../std_containers.h"
 #include "../tensor_traits.h"
 #include "copy.h"
 #include <cstddef>
@@ -33,7 +34,7 @@ namespace tar {
 #if defined(NNCASE_XPU_MODULE) && defined(SYS_MODE)
 __device__ extern uint8_t collective_pool_ptr[];
 #else
-extern uint8_t collective_pool_ptr[];
+NTT_DEVICE extern uint8_t collective_pool_ptr[];
 #endif
 } // namespace tar
 
@@ -116,12 +117,12 @@ struct reshard_impl<SrcTensor, DestTensor> {
                                       const TShardIndex &shard_index) {
         // 1. Fill split axes.
         auto split_phase1 = src.shape().aggregate(
-            std::make_tuple(fixed_shape_v<>, fixed_shape_v<>, fixed_shape_v<>),
+            ntt::make_tuple(fixed_shape_v<>, fixed_shape_v<>, fixed_shape_v<>),
             [&](auto last_acc, auto global_dim, auto axis) {
                 auto [last_global_offset, last_local_offset, last_shape] =
                     last_acc;
                 const auto policy =
-                    std::get<axis>(src.sharding().axis_policies);
+                    ntt::get<axis>(src.sharding().axis_policies);
                 if constexpr (distributed::SplitShardPolicy<
                                   std::decay_t<decltype(policy)>>) {
                     // Split axis, simply calculate the global offset and
@@ -133,12 +134,12 @@ struct reshard_impl<SrcTensor, DestTensor> {
                     const auto local_shape =
                         policy.template shard_dim<mesh_type>(global_dim,
                                                              shard_index);
-                    return std::make_tuple(
+                    return ntt::make_tuple(
                         last_global_offset.append(global_offset),
                         last_local_offset.append(local_offset),
                         last_shape.append(local_shape));
                 } else {
-                    return std::make_tuple(last_global_offset.append(dim_zero),
+                    return ntt::make_tuple(last_global_offset.append(dim_zero),
                                            last_local_offset.append(dim_zero),
                                            last_shape.append(dim_zero));
                 }
@@ -169,7 +170,7 @@ struct reshard_impl<SrcTensor, DestTensor> {
                 const auto global_end =
                     ntt::min(global_offset + local_dim, global_dim);
                 const auto in_bound = global_offset < global_dim;
-                return std::make_tuple(
+                return ntt::make_tuple(
                     last_global_offset.template replace_at<axis>(
                         ntt::where(in_bound, global_offset, dim_zero)),
                     last_local_offset.template replace_at<axis>(
@@ -228,11 +229,11 @@ struct reshard_impl<SrcTensor, DestTensor> {
         constexpr auto non_split_tensor_axes =
             distributed::detail::tensor_axes_of_non_split_shard_policies<
                 sharding_type>();
-        std::array<dim_t, non_split_tensor_axes.rank()> split_counts{};
+        ntt::array<dim_t, non_split_tensor_axes.rank()> split_counts{};
 
         // 1. Calculate the initial split counts.
         {
-            std::array<float, non_split_tensor_axes.rank()> log_dims;
+            ntt::array<float, non_split_tensor_axes.rank()> log_dims;
             for (size_t i = 0; i < non_split_tensor_axes.rank(); i++) {
                 auto dim = (float)shape.at(non_split_tensor_axes[i]);
                 log_dims[i] = std::log(dim);
@@ -346,7 +347,7 @@ struct reshard_impl<SrcTensor, DestTensor> {
     }
 
   private:
-    void copy_to_global(const SrcTensor &src) noexcept {
+    constexpr void copy_to_global(const SrcTensor &src) noexcept {
         auto global_buffer_address =
             reinterpret_cast<typename SrcTensor::value_type *>(
                 tar::collective_pool_ptr);
@@ -355,7 +356,7 @@ struct reshard_impl<SrcTensor, DestTensor> {
         reshard(src, global_tensor);
     }
 
-    void copy_from_global(DestTensor &dest) noexcept {
+    constexpr void copy_from_global(DestTensor &dest) noexcept {
         auto global_buffer_address =
             reinterpret_cast<const typename DestTensor::value_type *>(
                 tar::collective_pool_ptr);
@@ -382,23 +383,31 @@ struct reshard_impl<SrcTensor, DestTensor> {
         // 2. get mesh index of src candidates
         // 2.1 generate coords for each split axis
         auto make_coords = [&]<std::size_t... Is>(std::index_sequence<Is...>) {
-            return std::make_tuple(std::array<size_t, mesh_type::shape[Is]>{}...);
+            return ntt::make_tuple(
+                std::array<size_t, mesh_type::shape[Is]>{}...);
         };
-        auto coords = make_coords(std::make_index_sequence<mesh_type::rank()>{});
+        auto coords =
+            make_coords(std::make_index_sequence<mesh_type::rank()>{});
         std::array<size_t, mesh_type::rank()> counts{};
 
         auto get_coord = [&]<size_t tensor_axis>() {
-            const auto policy = std::get<tensor_axis>(src.sharding().axis_policies);
-            if constexpr (distributed::SplitShardPolicy<std::decay_t<decltype(policy)>>) {
+            const auto policy =
+                ntt::get<tensor_axis>(src.sharding().axis_policies);
+            if constexpr (distributed::SplitShardPolicy<
+                              std::decay_t<decltype(policy)>>) {
                 size_t num_blocks = 1;
                 constexpr auto policy_rank = policy.axes.rank();
                 for (size_t i = 0; i < policy_rank; ++i) {
                     num_blocks *= mesh_type::shape.at(policy.axes.at(i));
                 }
 
-                size_t block_size = (global_shape[tensor_axis] + num_blocks - 1) / num_blocks;
-                size_t begin_block = dest_start_offset[tensor_axis] / block_size;
-                size_t end_block = (dest_start_offset[tensor_axis] + dest_local_shape[tensor_axis] - 1) / block_size;
+                size_t block_size =
+                    (global_shape[tensor_axis] + num_blocks - 1) / num_blocks;
+                size_t begin_block =
+                    dest_start_offset[tensor_axis] / block_size;
+                size_t end_block = (dest_start_offset[tensor_axis] +
+                                    dest_local_shape[tensor_axis] - 1) /
+                                   block_size;
                 for (size_t block = begin_block; block <= end_block; ++block) {
                     size_t remainder = block;
                     auto axes_reverse = policy.axes.reverse();
@@ -408,7 +417,7 @@ struct reshard_impl<SrcTensor, DestTensor> {
                         size_t c = remainder % dim;
                         remainder /= dim;
 
-                        auto& coord = std::get<mesh_axis>(coords);
+                        auto &coord = ntt::get<mesh_axis>(coords);
                         bool exist = false;
                         for (size_t i = 0; i < counts[mesh_axis]; ++i) {
                             if (coord[i] == c) {
@@ -431,38 +440,41 @@ struct reshard_impl<SrcTensor, DestTensor> {
         get_all_coords(std::make_index_sequence<tensor_rank>{});
 
         // 2.2 update coords for mesh broadcast axis
-        auto update_broadcast_axis = [&]<std::size_t... Is>(std::index_sequence<Is...>) {
-            (([&] {
-                if (counts[Is] == 0) {
-                    auto& coord = std::get<Is>(coords);
-                    coord[counts[Is]++] = local_mesh_index[Is];
-                }
-            }()), ...);
-        };
+        auto update_broadcast_axis =
+            [&]<std::size_t... Is>(std::index_sequence<Is...>) {
+                (([&] {
+                     if (counts[Is] == 0) {
+                         auto &coord = ntt::get<Is>(coords);
+                         coord[counts[Is]++] = local_mesh_index[Is];
+                     }
+                 }()),
+                 ...);
+            };
         update_broadcast_axis(std::make_index_sequence<mesh_type::rank()>{});
 
         // 2.3 compute Cartesian product of coords
         std::array<bool, mesh_type::shape.length()> candidates{};
         std::array<size_t, mesh_type::rank()> current_coord{};
-        auto compute_cartesian_product = [&](auto&& self, auto axis_const) -> void {
+        auto compute_cartesian_product = [&](auto &&self,
+                                             auto axis_const) -> void {
             constexpr size_t axis = decltype(axis_const)::value;
             if constexpr (axis == mesh_type::rank()) {
                 dynamic_shape_t<mesh_type::rank()> coord{};
-                loop<mesh_type::rank()>([&](auto i) {
-                    coord[i] = current_coord[i];
-                });
+                loop<mesh_type::rank()>(
+                    [&](auto i) { coord[i] = current_coord[i]; });
                 size_t linear_idx = linear_offset(coord, mesh_type::shape);
                 candidates[linear_idx] = true;
                 return;
             } else {
-                const auto& coord = std::get<axis>(coords);
+                const auto &coord = ntt::get<axis>(coords);
                 for (size_t i = 0; i < counts[axis]; ++i) {
                     current_coord[axis] = coord[i];
                     self(self, std::integral_constant<size_t, axis + 1>{});
                 }
             }
         };
-        compute_cartesian_product(compute_cartesian_product, std::integral_constant<size_t, 0>{});
+        compute_cartesian_product(compute_cartesian_product,
+                                  std::integral_constant<size_t, 0>{});
 
         // 3. traverse src index
         for (size_t i = 0; i < candidates.size(); ++i) {
diff --git a/ntt/include/nncase/ntt/kernels/resize_image.h b/ntt/include/nncase/ntt/kernels/resize_image.h
index 743dd9708..57ebb2e03 100644
--- a/ntt/include/nncase/ntt/kernels/resize_image.h
+++ b/ntt/include/nncase/ntt/kernels/resize_image.h
@@ -50,13 +50,7 @@ using get_coordinate_func_t = float (*)(float, float, float, float, float,
                                         float);
 using get_nearest_pixel_func_t = int64_t (*)(float);
 
-get_coordinate_func_t get_coordinate_from_resized(
-    image_resize_transformation_mode_t coordinate_transform_mode);
-
-get_nearest_pixel_func_t
-get_nearest_pixel_from_origin(image_resize_nearest_mode_t nearest_mode);
-
-inline get_coordinate_func_t get_coordinate_from_resized(
+inline constexpr get_coordinate_func_t get_coordinate_from_resized(
     image_resize_transformation_mode_t coordinate_transform_mode) {
     switch (coordinate_transform_mode) {
     case image_resize_transformation_mode_t::asymmetric:
@@ -95,7 +89,7 @@ inline get_coordinate_func_t get_coordinate_from_resized(
     }
 }
 
-inline get_nearest_pixel_func_t
+inline constexpr get_nearest_pixel_func_t
 get_nearest_pixel_from_origin(image_resize_nearest_mode_t nearest_mode) {
     switch (nearest_mode) {
     case image_resize_nearest_mode_t::round_prefer_ceil:
@@ -122,21 +116,23 @@ get_nearest_pixel_from_origin(image_resize_nearest_mode_t nearest_mode) {
 }
 
 template <typename TShape>
-inline std::tuple<float, float> get_resize_scales(TShape in_shape,
-                                                  int32_t out_h, int32_t out_w,
-                                                  bool align_corners) noexcept {
+inline constexpr ntt::tuple<float, float>
+get_resize_scales(TShape in_shape, int32_t out_h, int32_t out_w,
+                  bool align_corners) noexcept {
     auto height_scale = (float)in_shape[2] / out_h;
     auto width_scale = (float)in_shape[3] / out_w;
     if (align_corners && out_h > 1)
         height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
     if (align_corners && out_w > 1)
         width_scale = (float)(in_shape[3] - 1) / (out_w - 1);
-    return std::make_tuple(height_scale, width_scale);
+    return ntt::make_tuple(height_scale, width_scale);
 }
 
-inline void set_resize_bilinear(size_t value, float scale,
-                                bool half_pixel_centers, size_t shape_size,
-                                float &scaled_value, int32_t &v0, int32_t &v1) {
+inline constexpr void set_resize_bilinear(size_t value, float scale,
+                                          bool half_pixel_centers,
+                                          size_t shape_size,
+                                          float &scaled_value, int32_t &v0,
+                                          int32_t &v1) {
     if (half_pixel_centers) {
         scaled_value = (value + 0.5f) * scale - 0.5f;
     } else {
@@ -148,20 +144,21 @@ inline void set_resize_bilinear(size_t value, float scale,
                   static_cast<int32_t>(shape_size - 1));
 }
 
-template <typename T> float get_rounding_offset() {
+template <typename T> constexpr float get_rounding_offset() {
     return std::is_integral_v<T> ? .5f : .0f;
 }
 
-template <FixedTensor T> float get_rounding_offset() {
+template <FixedTensor T> constexpr float get_rounding_offset() {
     return std::is_integral_v<typename T::element_type> ? .5f : .0f;
 }
 
 template <typename T, typename TInShape, typename TInStrides,
           typename TOutStrides>
-void resize_bilinear(const T *input, T *output, const TInShape in_shape,
-                     const TInStrides in_strides, const TOutStrides out_strides,
-                     int32_t out_h, int32_t out_w, bool align_corners,
-                     bool half_pixel_centers) noexcept {
+constexpr void
+resize_bilinear(const T *input, T *output, const TInShape in_shape,
+                const TInStrides in_strides, const TOutStrides out_strides,
+                int32_t out_h, int32_t out_w, bool align_corners,
+                bool half_pixel_centers) noexcept {
     auto [height_scale, width_scale] =
         get_resize_scales(in_shape, out_h, out_w, align_corners);
 
@@ -215,13 +212,14 @@ void resize_bilinear(const T *input, T *output, const TInShape in_shape,
 
 template <typename T, typename TInShape, typename TInStrides,
           typename TOutStrides>
-void resize_neareast_neighbor(
-    const T *input, T *output, const TInShape in_shape,
-    const TInStrides in_strides, const TOutStrides out_strides,
-    const int32_t out_h, const int32_t out_w, bool align_corners,
-    [[maybe_unused]] bool half_pixel_centers,
-    get_coordinate_func_t get_coordinate_func,
-    get_nearest_pixel_func_t get_nearset_func) noexcept {
+constexpr void
+resize_neareast_neighbor(const T *input, T *output, const TInShape in_shape,
+                         const TInStrides in_strides,
+                         const TOutStrides out_strides, const int32_t out_h,
+                         const int32_t out_w, bool align_corners,
+                         [[maybe_unused]] bool half_pixel_centers,
+                         get_coordinate_func_t get_coordinate_func,
+                         get_nearest_pixel_func_t get_nearset_func) noexcept {
     auto [height_scale, width_scale] =
         get_resize_scales(in_shape, out_h, out_w, align_corners);
 
@@ -265,12 +263,12 @@ void resize_neareast_neighbor(
 
 template <Tensor TIn, typename TOut, FixedDimensions TVectorizedAxes,
           FixedDimensions TPadedNums, FixedDimensions TNewSize>
-void resize(const TIn &input, TOut &&output,
-            [[maybe_unused]] const TVectorizedAxes &vectorizedAxes,
-            [[maybe_unused]] const TPadedNums &padedNums,
-            const TNewSize &new_size, image_resize_mode_t resize_mode,
-            image_resize_transformation_mode_t transformation_mode,
-            image_resize_nearest_mode_t nearest_mode) {
+constexpr void resize(const TIn &input, TOut &&output,
+                      [[maybe_unused]] const TVectorizedAxes &vectorizedAxes,
+                      [[maybe_unused]] const TPadedNums &padedNums,
+                      const TNewSize &new_size, image_resize_mode_t resize_mode,
+                      image_resize_transformation_mode_t transformation_mode,
+                      image_resize_nearest_mode_t nearest_mode) {
     if (resize_mode == image_resize_mode_t::bilinear) {
         resize_detail::resize_bilinear(
             input.elements().data(), output.elements().data(), input.shape(),
diff --git a/ntt/include/nncase/ntt/kernels/scatter_nd.h b/ntt/include/nncase/ntt/kernels/scatter_nd.h
index 54d16c807..8c4d92048 100644
--- a/ntt/include/nncase/ntt/kernels/scatter_nd.h
+++ b/ntt/include/nncase/ntt/kernels/scatter_nd.h
@@ -23,8 +23,8 @@
 namespace nncase::ntt {
 namespace scatter_nd_detail {
 template <Tensor TIn, Tensor TIndex, Tensor TUpdates, Tensor TOut>
-void scatter_nd_impl(const TIn &input, const TIndex &indices,
-                     const TUpdates &updates, TOut &output) noexcept {
+constexpr void scatter_nd_impl(const TIn &input, const TIndex &indices,
+                               const TUpdates &updates, TOut &output) noexcept {
     using TIElem = typename TIn::element_type;
     [[maybe_unused]] const auto in_shape = input.shape();
     const auto indices_shape = indices.shape();
@@ -65,8 +65,8 @@ void scatter_nd_impl(const TIn &input, const TIndex &indices,
 } // namespace scatter_nd_detail
 
 template <typename TIn, typename TIndex, typename TUpdate, typename TOut>
-void scatter_nd(const TIn &input, TIndex &indices, TUpdate &updates,
-                TOut &&output) noexcept {
+constexpr void scatter_nd(const TIn &input, TIndex &indices, TUpdate &updates,
+                          TOut &&output) noexcept {
     scatter_nd_detail::scatter_nd_impl(input, indices, updates, output);
 }
 } // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/kernels/slice.h b/ntt/include/nncase/ntt/kernels/slice.h
index a63104c1d..3596fcfda 100644
--- a/ntt/include/nncase/ntt/kernels/slice.h
+++ b/ntt/include/nncase/ntt/kernels/slice.h
@@ -54,7 +54,7 @@ constexpr auto translate_slice_params(const TInShape &in_shape,
                                                     auto axis, auto i) {
             return cnt_new_steps.template replace_at<axis>(steps[i]);
         });
-    return std::make_tuple(new_begins, new_steps);
+    return ntt::make_tuple(new_begins, new_steps);
 }
 } // namespace slice_detail
 
@@ -71,9 +71,9 @@ constexpr auto translate_slice_params(const TInShape &in_shape,
 template <Tensor TIn, typename TOut, typename TBegins, typename TEnds,
           FixedDimensions TAxes = decltype(make_index_shape<TBegins::rank()>()),
           FixedDimensions TSteps = decltype(make_ones_shape<TBegins::rank()>())>
-void slice(const TIn &input, TOut &&output, const TBegins &begins,
-           [[maybe_unused]] const TEnds &ends, const TAxes &axes = {},
-           const TSteps &steps = {}) {
+constexpr void slice(const TIn &input, TOut &&output, const TBegins &begins,
+                     [[maybe_unused]] const TEnds &ends, const TAxes &axes = {},
+                     const TSteps &steps = {}) {
     static_assert(TBegins::rank() == TEnds::rank() &&
                       TBegins::rank() == TAxes::rank() &&
                       TBegins::rank() == TSteps::rank(),
diff --git a/ntt/include/nncase/ntt/kernels/stack.h b/ntt/include/nncase/ntt/kernels/stack.h
index d2b936bc6..ec61f75cd 100644
--- a/ntt/include/nncase/ntt/kernels/stack.h
+++ b/ntt/include/nncase/ntt/kernels/stack.h
@@ -21,14 +21,14 @@
 
 namespace nncase::ntt {
 template <size_t Axis, Tensor... TInputs, class TOut>
-void stack(const std::tuple<TInputs...> &inputs, TOut &&output) {
+void stack(const ntt::tuple<TInputs...> &inputs, TOut &&output) {
     auto domain = output.shape().template replace_at<Axis>(dim_one);
     apply(domain, [&](auto out_index) {
         auto left_shape = slice_dims<Axis>(out_index);
         auto right_shape = slice_dims<output.rank() - Axis - 1>(out_index);
         auto in_index = concat_dims(left_shape, right_shape);
         loop<sizeof...(TInputs)>([&](auto i) {
-            auto input = std::get<i>(inputs);
+            auto input = ntt::get<i>(inputs);
             for (out_index[Axis] = 0; out_index[Axis] < input.size();
                  out_index[Axis]++) {
                 output(out_index) = input(in_index);
diff --git a/ntt/include/nncase/ntt/kernels/summa.h b/ntt/include/nncase/ntt/kernels/summa.h
index f39699c1f..a08161ab7 100644
--- a/ntt/include/nncase/ntt/kernels/summa.h
+++ b/ntt/include/nncase/ntt/kernels/summa.h
@@ -29,12 +29,12 @@ template <bool AccumulateC = false, bool TransposedA = false,
           class TOut, class TScale, FixedDimensions LhsVectorizedAxes,
           FixedDimensions LhsPadedNums, FixedDimensions RhsVectorizedAxes,
           FixedDimensions RhsPadedNums>
-void summa(
-    const TLhs &lhs, const TRhs &rhs, TOut &&output, const TScale &scale,
-    [[maybe_unused]] LhsVectorizedAxes lhsVectorizedAxes = fixed_shape_v<>,
-    [[maybe_unused]] LhsPadedNums lhsPadedNums = fixed_shape_v<>,
-    [[maybe_unused]] RhsVectorizedAxes rhsVectorizedAxes = fixed_shape_v<>,
-    [[maybe_unused]] RhsPadedNums rhsPadedNums = fixed_shape_v<>) {
+constexpr void
+summa(const TLhs &lhs, const TRhs &rhs, TOut &&output, const TScale &scale,
+      [[maybe_unused]] LhsVectorizedAxes lhsVectorizedAxes = fixed_shape_v<>,
+      [[maybe_unused]] LhsPadedNums lhsPadedNums = fixed_shape_v<>,
+      [[maybe_unused]] RhsVectorizedAxes rhsVectorizedAxes = fixed_shape_v<>,
+      [[maybe_unused]] RhsPadedNums rhsPadedNums = fixed_shape_v<>) {
     static_assert(TransposedA == false && TransposedB == false,
                   "not supported for now");
     using TLhsElem = typename TLhs::element_type;
diff --git a/ntt/include/nncase/ntt/kernels/transpose.h b/ntt/include/nncase/ntt/kernels/transpose.h
index 3f376b79b..93355271a 100644
--- a/ntt/include/nncase/ntt/kernels/transpose.h
+++ b/ntt/include/nncase/ntt/kernels/transpose.h
@@ -38,9 +38,9 @@ constexpr auto segments_cnt(const TPerms &) noexcept {
 template <Tensor TIn, class TOut, FixedDimensions TPerms>
     requires(bool(TIn::rank() == std::decay_t<TOut>::rank()) &&
              bool(TIn::rank() == TPerms::rank()))
-void transpose(const TIn &input, TOut &&output,
-               [[maybe_unused]] const TPerms &perms =
-                   make_index_shape<TIn::rank()>().reverse()) {
+constexpr void transpose(const TIn &input, TOut &&output,
+                         [[maybe_unused]] const TPerms &perms =
+                             make_index_shape<TIn::rank()>().reverse()) {
     using TInElem = TIn::element_type;
     using TOutElem = std::decay_t<TOut>::element_type;
 
diff --git a/ntt/include/nncase/ntt/kernels/unary.h b/ntt/include/nncase/ntt/kernels/unary.h
index 8cfd33cdc..198ebea0b 100644
--- a/ntt/include/nncase/ntt/kernels/unary.h
+++ b/ntt/include/nncase/ntt/kernels/unary.h
@@ -22,8 +22,8 @@ template <Tensor TIn, Tensor TOut>
 class unary_impl : public unary_like_impl<unary_impl<TIn, TOut>, TIn, TOut> {
   public:
     template <Tensor TBroadcastedIn, class Op>
-    void invoke_ukernel(const TBroadcastedIn &input, TOut &output,
-                        const Op &op) {
+    constexpr void invoke_ukernel(const TBroadcastedIn &input, TOut &output,
+                                  const Op &op) {
         auto input_conti_dims = contiguous_dims(input.shape(), input.strides());
         auto output_conti_dims =
             contiguous_dims(output.shape(), output.strides());
@@ -55,8 +55,9 @@ class unary_impl : public unary_like_impl<unary_impl<TIn, TOut>, TIn, TOut> {
 } // namespace detail
 
 template <template <class T> class Op, Tensor TIn, class TOut>
-void unary(const TIn &input, TOut &&output,
-           const Op<std::remove_cv_t<typename TIn::element_type>> &op = {}) {
+constexpr void
+unary(const TIn &input, TOut &&output,
+      const Op<std::remove_cv_t<typename TIn::element_type>> &op = {}) {
     detail::unary_impl<TIn, std::decay_t<TOut>> impl;
     impl(input, output, op);
 }
diff --git a/ntt/include/nncase/ntt/kernels/unpack.h b/ntt/include/nncase/ntt/kernels/unpack.h
index f78d7d1ab..1aaa03f25 100644
--- a/ntt/include/nncase/ntt/kernels/unpack.h
+++ b/ntt/include/nncase/ntt/kernels/unpack.h
@@ -65,7 +65,8 @@ template <Tensor TIn, Tensor TOut, size_t AxesRank> class unpack_impl {
 } // namespace detail
 
 template <Tensor TIn, class TOut, FixedDimensions TAxes>
-void unpack(const TIn &input, TOut &&output, const TAxes &axes) noexcept {
+constexpr void unpack(const TIn &input, TOut &&output,
+                      const TAxes &axes) noexcept {
     detail::unpack_impl<TIn, std::decay_t<TOut>, TAxes::rank()> impl;
     impl(input, output, axes);
 }
diff --git a/ntt/include/nncase/ntt/kernels/where.h b/ntt/include/nncase/ntt/kernels/where.h
index 8e4ccadd8..e2604528a 100644
--- a/ntt/include/nncase/ntt/kernels/where.h
+++ b/ntt/include/nncase/ntt/kernels/where.h
@@ -51,7 +51,7 @@ class where_impl : public elementwise_impl<where_impl<TCond, TX, TY, TOut>,
 } // namespace detail
 
 template <Tensor TCond, Tensor TX, Tensor TY, class TOut>
-void where(const TCond &cond, const TX &x, const TY &y, TOut &&output) {
+constexpr void where(const TCond &cond, const TX &x, const TY &y, TOut &&output) {
     detail::where_impl<TCond, TX, TY, std::decay_t<TOut>>()(cond, x, y, output);
 }
 } // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/ntt.h b/ntt/include/nncase/ntt/ntt.h
index 9afb67da9..5a75317d9 100644
--- a/ntt/include/nncase/ntt/ntt.h
+++ b/ntt/include/nncase/ntt/ntt.h
@@ -16,17 +16,37 @@
 #include "../bfloat16.h"
 #include "../float8.h"
 #include "../half.h"
+#include "std_containers.h"
 
 #include "caching.h"
 #include "distributed.h"
 #include "primitive_ops.h"
 
+// Arch specific types & ops
+
+#ifdef __CUDA_ARCH__
+#include "arch/cuda/vector_ops.h"
+#else
+#ifndef NNCASE_XPU_MODULE
+#ifdef __AVX2__
+#include "arch/x86_64/arch_types.h"
+#include "arch/x86_64/primitive_ops.h"
+#include "arch/x86_64/ukernels.h"
+#include "arch/x86_64/vector_ops.h"
+#elif __aarch64__
+#include "arch/aarch64/arch_types.h"
+#include "arch/aarch64/primitive_ops.h"
+#include "arch/aarch64/vector_ops.h"
+#endif
+#endif
+
 #ifdef __riscv_vector
 #include "arch/riscv64/arch_types.h"
 #include "arch/riscv64/primitive_ops.h"
 #include "arch/riscv64/ukernels.h"
 #include "arch/riscv64/vector_ops.h"
 #endif
+#endif
 
 #include "kernels/binary.h"
 #include "kernels/cast.h"
@@ -37,7 +57,6 @@
 #include "kernels/conv2d.h"
 #include "kernels/copy.h"
 #include "kernels/expand.h"
-#include "kernels/gather.h"
 #include "kernels/get_item.h"
 #include "kernels/get_position_ids.h"
 #include "kernels/im2col.h"
@@ -75,26 +94,22 @@
 #include "vector.h"
 #include "vector_ops.h"
 
-#ifndef NNCASE_XPU_MODULE
-#ifdef __AVX2__
-#include "arch/x86_64/arch_types.h"
-#include "arch/x86_64/primitive_ops.h"
-#include "arch/x86_64/ukernels.h"
-#include "arch/x86_64/vector_ops.h"
-#elif __aarch64__
-#include "arch/aarch64/arch_types.h"
-#include "arch/aarch64/primitive_ops.h"
-#include "arch/aarch64/vector_ops.h"
-#endif
-#endif
-
+// Distributed & Runtime
 
 #ifdef NNCASE_XPU_MODULE
 #include "arch/xpu/arch_types.h"
 #include "arch/xpu/distributed.h"
 #include "arch/xpu/runtime.h"
+#elif defined(__CUDA_ARCH__)
+#include "arch/cuda/distributed.h"
+#include "arch/cuda/runtime.h"
 #else
 #include "arch/cpu/distributed.h"
-#include "arch/cpu/profiling.h"
 #include "arch/cpu/runtime.h"
 #endif
+
+#include "caching.h"
+#include "distributed.h"
+#include "kernels/gather.h"
+#include "kernels/paged_attention.h"
+#include "kernels/reshard.h"
diff --git a/ntt/include/nncase/ntt/padding.h b/ntt/include/nncase/ntt/padding.h
index 2e5fab05d..c1295db63 100644
--- a/ntt/include/nncase/ntt/padding.h
+++ b/ntt/include/nncase/ntt/padding.h
@@ -72,12 +72,12 @@ constexpr auto make_paddings(const TPaddings &...paddings) noexcept;
 
 template <Padding... TPaddings> class paddings_t {
   public:
-    using paddings_type = std::tuple<TPaddings...>;
+    using paddings_type = ntt::tuple<TPaddings...>;
 
     constexpr paddings_t() = default;
 
     constexpr paddings_t(const TPaddings &...paddings) noexcept
-        : paddings_(std::make_tuple(paddings...)) {}
+        : paddings_(ntt::make_tuple(paddings...)) {}
 
     static constexpr auto rank() noexcept {
         return fixed_dim_v<sizeof...(TPaddings)>;
@@ -95,7 +95,7 @@ template <Padding... TPaddings> class paddings_t {
 
     template <size_t Rank = rank(), class = std::enable_if_t<Rank != 0>>
     constexpr paddings_t() noexcept
-        : paddings_(std::make_tuple(TPaddings{}...)) {}
+        : paddings_(ntt::make_tuple(TPaddings{}...)) {}
 
     template <Dimension TIndex>
     constexpr auto operator[](const TIndex &index) const noexcept {
@@ -135,23 +135,23 @@ template <Padding... TPaddings> class paddings_t {
     template <Dimension TIndex>
     constexpr auto at(const TIndex &) const noexcept {
         if constexpr (FixedDimension<TIndex>) {
-            return std::get<TIndex::value>(paddings_);
+            return ntt::get<TIndex::value>(paddings_);
         } else {
             return to_array()[TIndex{}];
         }
     }
 
     template <dim_t TIndex> constexpr auto at() const noexcept {
-        return std::get<TIndex>(paddings_);
+        return ntt::get<TIndex>(paddings_);
     }
 
     template <dim_t TIndex> constexpr auto &at() noexcept {
-        return std::get<TIndex>(paddings_);
+        return ntt::get<TIndex>(paddings_);
     }
 
     template <FixedDimension TIndex>
     constexpr auto &at(const TIndex &) noexcept {
-        return std::get<TIndex::value>(paddings_);
+        return ntt::get<TIndex::value>(paddings_);
     }
 
     constexpr std::array<std::array<dim_t, 2>, rank()>
@@ -165,7 +165,7 @@ template <Padding... TPaddings> class paddings_t {
     }
 
   private:
-    NTT_NO_UNIQUE_ADDRESS std::tuple<TPaddings...> paddings_;
+    NTT_NO_UNIQUE_ADDRESS ntt::tuple<TPaddings...> paddings_;
 };
 
 template <class T>
@@ -184,9 +184,11 @@ template <class I> struct dynamic_paddings_type_impl;
 
 template <size_t... I>
 struct dynamic_paddings_type_impl<std::index_sequence<I...>> {
-    template <std::size_t> using elem_type = dim_t;
+    template <std::size_t> struct elem_type {
+        using type = padding_t<dim_t, dim_t>;
+    };
 
-    using type = paddings_t<elem_type<I>...>;
+    using type = paddings_t<typename elem_type<I>::type...>;
 };
 
 template <size_t Rank>
@@ -255,7 +257,8 @@ struct tuple_size<TPaddings>
 
 template <size_t I, nncase::ntt::Padding... TPaddings>
 struct tuple_element<I, nncase::ntt::padding_t<TPaddings...>> {
-    using type = std::tuple_element_t<I, std::tuple<TPaddings...>>;
+    using type =
+        nncase::ntt::tuple_element_t<I, nncase::ntt::tuple<TPaddings...>>;
 };
 
 template <size_t I, nncase::ntt::Paddings TPaddings>
diff --git a/ntt/include/nncase/ntt/primitive_ops.h b/ntt/include/nncase/ntt/primitive_ops.h
index 430e9406f..5ff4334f3 100644
--- a/ntt/include/nncase/ntt/primitive_ops.h
+++ b/ntt/include/nncase/ntt/primitive_ops.h
@@ -41,7 +41,9 @@ namespace ops {
  */
 
 template <prefetch_hint Hint, bool Arch> struct prefetch {
-    void operator()(const void *ptr) const noexcept { __builtin_prefetch(ptr); }
+    constexpr void operator()(const void *ptr) const noexcept {
+        __builtin_prefetch(ptr);
+    }
 };
 
 template <class TDest, class TSource> struct store {
@@ -198,7 +200,7 @@ template <class T1, class T2> struct ceil_div {
  */
 template <class T1, class T2> struct floor_mod {
     constexpr auto operator()(const T1 &v1, const T2 &v2) const noexcept {
-#ifdef __clang__
+#if defined(__clang__) && !defined(__CUDA_ARCH__)
 #pragma float_control(precise, on)
 #endif
         return (T1)(double(v1) - std::floor(static_cast<double>(v1) /
@@ -486,7 +488,7 @@ constexpr T1 clamp(const T1 &v, const T2 &min, const T2 &max) noexcept {
     return ops::clamp<T1, T2>()(v, min, max);
 }
 
-template <ScalarOrVector TCond, ScalarOrVector TX, ScalarOrVector TY>
+template <class TCond, class TX, class TY>
 constexpr auto where(const TCond &cond, const TX &x, const TY &y) noexcept {
     return ops::where<TCond, TX, TY>()(cond, x, y);
 }
diff --git a/ntt/include/nncase/ntt/profiling.h b/ntt/include/nncase/ntt/profiling.h
index a6a97254d..83718da11 100644
--- a/ntt/include/nncase/ntt/profiling.h
+++ b/ntt/include/nncase/ntt/profiling.h
@@ -13,71 +13,49 @@
  * limitations under the License.
  */
 #pragma once
+#include "compiler_defs.h"
 #include <cstdint>
-#include <string_view>
-#include <unordered_map>
-#include <vector>
 
-namespace nncase::ntt::runtime {
+namespace nncase::ntt {
+enum class profile_level { kernel, device };
 
-enum class profiling_level { kernel, device };
+namespace runtime {
+struct profile_record {
+    uint32_t function_id;
+    uint64_t duration;
+};
 
-static std::string_view to_string(profiling_level level) {
-    switch (level) {
-    case profiling_level::kernel:
-        return "kernel";
-    case profiling_level::device:
-        return "device";
-    default:
-        return "unknown";
-    }
-}
+NTT_DEVICE bool is_profiling_enabled() noexcept;
+NTT_DEVICE uint64_t get_profile_time() noexcept;
+NTT_DEVICE void record_profile(profile_level level,
+                               const profile_record &record) noexcept;
+} // namespace runtime
 
-template <class TOPOLOGY> class timer_record_base {
+class profile_scope {
   public:
-    struct call_instance {
-        uint64_t start_time;
-        uint64_t end_time;
-    };
-
-    struct function_stats {
-        uint64_t call_count = 0;
-        uint64_t total_time = 0;
-        profiling_level level;
-        std::vector<call_instance> calls;
-    };
-
-    // 判断记录是否有效
-    virtual bool is_valid() const = 0;
-
-    // 设置计时记录
-    virtual void set_time(std::string_view function_name, uint64_t start_time,
-                          uint64_t end_time) = 0;
-
-    // 控制台打印统计信息
-    virtual void console_print() const = 0;
-
-    // 导出为 CSV 文件
-    virtual void csv_print(std::string_view filename) const = 0;
-
-    // 导出为 JSON 文件
-    virtual void markdown_print(std::string_view filename) const = 0;
-
-    // 导出为 JSON 文件
-    virtual void json_print(std::string_view filename) const = 0;
-
-    // 设置记录 ID
-    virtual void set_id(TOPOLOGY id) = 0;
-
-    virtual void set_level(std::string_view filename,
-                           profiling_level level) = 0;
+    NTT_DEVICE
+    profile_scope(uint32_t function_id,
+                  profile_level level = profile_level::kernel) noexcept
+        : enabled_(runtime::is_profiling_enabled()),
+          function_id_(function_id),
+          level_(level) {
+        if (enabled_) {
+            start_time_ = runtime::get_profile_time();
+        }
+    }
 
-    // 虚析构函数，确保子类正确释放资源
-    virtual ~timer_record_base() = default;
+    NTT_DEVICE ~profile_scope() noexcept {
+        if (enabled_) {
+            auto duration = runtime::get_profile_time() - start_time_;
+            runtime::profile_record record{function_id_, duration};
+            runtime::record_profile(level_, record);
+        }
+    }
 
-  protected:
-    timer_record_base() = default; // 禁止直接实例化抽象类
-    TOPOLOGY instance_id_;
-    std::unordered_map<std::string_view, function_stats> function_stats_;
+  private:
+    bool enabled_;
+    uint32_t function_id_;
+    profile_level level_;
+    uint64_t start_time_;
 };
-} // namespace nncase::ntt::runtime
+} // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/runtime.h b/ntt/include/nncase/ntt/runtime.h
index daa5169f5..e822c1b18 100644
--- a/ntt/include/nncase/ntt/runtime.h
+++ b/ntt/include/nncase/ntt/runtime.h
@@ -55,9 +55,11 @@ void *thread_alloc(size_t bytes, size_t alignment);
 void thread_free(void *ptr);
 } // namespace nncase::ntt::runtime
 
-extern "C" void
+extern "C" NTT_DEVICE void
 thread_main(const nncase::ntt::runtime::thread_inout_desc *input_descs,
             nncase::ntt::runtime::thread_inout_desc *const output_descs,
             const std::byte *rdata, const std::byte *thread_local_rdata,
+            const std::byte *warp_local_rdata,
             const std::byte *block_local_rdata, std::byte *thread_local_data,
-            std::byte *block_local_data, std::byte *output);
+            std::byte *warp_local_data, std::byte *block_local_data,
+            std::byte *output);
diff --git a/ntt/include/nncase/ntt/shape.h b/ntt/include/nncase/ntt/shape.h
index a667f36c5..3feade42f 100644
--- a/ntt/include/nncase/ntt/shape.h
+++ b/ntt/include/nncase/ntt/shape.h
@@ -17,7 +17,7 @@
 #include "dimension.h"
 #include "loop.h"
 #include "nncase/ntt/tensor_traits.h"
-#include <array>
+#include "std_containers.h"
 #include <cmath>
 #include <cstddef>
 #include <cstring>
@@ -41,9 +41,11 @@ struct dynamic_dims_type_impl;
 
 template <template <class... TDims> class Derived, size_t... I>
 struct dynamic_dims_type_impl<Derived, std::index_sequence<I...>> {
-    template <std::size_t> using elem_type = dim_t;
+    template <std::size_t> struct elem_type {
+        using type = dim_t;
+    };
 
-    using type = Derived<elem_type<I>...>;
+    using type = Derived<typename elem_type<I>::type...>;
 };
 
 template <template <class... TDims> class Derived, size_t Rank>
@@ -134,7 +136,7 @@ template <dims_usage Usage, template <class... TDims> class Derived,
           Dimension... TDims>
 struct dims_base {
     NTT_ALWAYS_INLINE constexpr dims_base(const TDims &...dims) noexcept
-        : dims_(std::make_tuple(dims...)) {}
+        : dims_(ntt::make_tuple(dims...)) {}
 
     static constexpr dims_usage usage() noexcept { return Usage; }
 
@@ -153,7 +155,7 @@ struct dims_base {
     static constexpr bool is_fixed() noexcept { return fixed_rank() == rank(); }
 
     template <size_t Rank = rank(), class = std::enable_if_t<Rank != 0>>
-    constexpr dims_base() noexcept : dims_(std::make_tuple(TDims{}...)) {}
+    constexpr dims_base() noexcept : dims_(ntt::make_tuple(TDims{}...)) {}
 
     template <Dimension TIndex>
     constexpr auto operator[](const TIndex &index) const noexcept {
@@ -198,18 +200,18 @@ struct dims_base {
     template <dim_t Index> constexpr auto at() const noexcept {
         constexpr auto PositiveIndex = positive_index(Index, rank());
         if constexpr (is_fixed()) {
-            return std::tuple_element_t<PositiveIndex, decltype(dims_)>{};
+            return ntt::tuple_element_t<PositiveIndex, decltype(dims_)>{};
         } else {
-            return std::get<PositiveIndex>(dims_);
+            return ntt::get<PositiveIndex>(dims_);
         }
     }
 
     template <dim_t Index> constexpr decltype(auto) at() noexcept {
         constexpr auto PositiveIndex = positive_index(Index, rank());
         if constexpr (is_fixed()) {
-            return std::tuple_element_t<PositiveIndex, decltype(dims_)>{};
+            return ntt::tuple_element_t<PositiveIndex, decltype(dims_)>{};
         } else {
-            return std::get<PositiveIndex>(dims_);
+            return ntt::get<PositiveIndex>(dims_);
         }
     }
 
@@ -337,10 +339,10 @@ struct dims_base {
         return slice_impl(std::make_index_sequence<Rank>());
     }
 
-    constexpr std::array<dim_t, rank()> to_array() const noexcept {
+    constexpr ntt::array<dim_t, rank()> to_array() const noexcept {
         auto at_impl = [this]<size_t... I>(std::index_sequence<I...>) {
             (void)this;
-            return std::array<dim_t, rank()>{at(fixed_dim_v<I>)...};
+            return ntt::array<dim_t, rank()>{at(fixed_dim_v<I>)...};
         };
         return at_impl(std::make_index_sequence<rank()>());
     }
@@ -350,7 +352,7 @@ struct dims_base {
     }
 
   private:
-    NTT_NO_UNIQUE_ADDRESS std::tuple<TDims...> dims_;
+    NTT_NO_UNIQUE_ADDRESS ntt::tuple<TDims...> dims_;
 };
 } // namespace detail
 
@@ -512,6 +514,9 @@ default_strides([[maybe_unused]] const TShape shape) noexcept {
     constexpr auto rank = TShape::rank();
     if constexpr (rank == 0) {
         return strides_t<>();
+    } else if constexpr (FixedDimensions<TShape>) {
+        return detail::default_strides_impl<rank, TShape, Canonical>{}(
+            TShape{}, strides_t<>());
     } else {
         return detail::default_strides_impl<rank, TShape, Canonical>{}(
             shape, strides_t<>());
@@ -549,14 +554,14 @@ template <template <class... TDims> class TDimensions = shape_t,
 NTT_ALWAYS_INLINE constexpr auto unravel_index(const TOffset &offset,
                                                const TShape &shape) noexcept {
     return shape.reverse().aggregate(
-        std::make_tuple(offset, TDimensions<>{}),
+        ntt::make_tuple(offset, TDimensions<>{}),
         [&](auto acc, auto dim, [[maybe_unused]] auto axis) {
             auto [last_remain, index] = acc;
             auto cnt_index = last_remain % dim;
             auto remain = last_remain / dim;
-            return std::make_tuple(remain, index.prepend(cnt_index));
+            return ntt::make_tuple(remain, index.prepend(cnt_index));
         },
-        [](auto acc) { return std::get<1>(acc); });
+        [](auto acc) { return ntt::get<1>(acc); });
 }
 
 namespace detail {
@@ -703,17 +708,17 @@ unsqueeze_dims(const TDims &dims, const TAxes &,
                const TDim &insert_dim) noexcept {
     constexpr auto positive_axes_v = positive_axes(TAxes{}, TDims::rank());
     return make_zeros_shape<TDims::rank() + TAxes::rank()>().aggregate(
-        std::make_tuple(empty_dims_alike_t<TDims>{}, dim_zero),
+        ntt::make_tuple(empty_dims_alike_t<TDims>{}, dim_zero),
         [&](auto acc, auto, auto axis) {
             auto [last_result, offset] = acc;
             if constexpr (positive_axes_v.contains(axis)) {
-                return std::make_tuple(last_result.append(insert_dim), offset);
+                return ntt::make_tuple(last_result.append(insert_dim), offset);
             } else {
-                return std::make_tuple(last_result.append(dims[offset]),
+                return ntt::make_tuple(last_result.append(dims[offset]),
                                        offset + dim_one);
             }
         },
-        [](auto acc) { return std::get<0>(acc); });
+        [](auto acc) { return ntt::get<0>(acc); });
 }
 
 template <Dimensions TDimsA, Dimensions TDimsB>
@@ -759,7 +764,7 @@ template <size_t I, template <class... TDims> class Derived,
           nncase::ntt::Dimension... TDims>
     requires(nncase::ntt::Dimensions<Derived<TDims...>>)
 struct tuple_element<I, Derived<TDims...>> {
-    using type = std::tuple_element_t<I, std::tuple<TDims...>>;
+    using type = nncase::ntt::tuple_element_t<I, nncase::ntt::tuple<TDims...>>;
 };
 
 template <size_t I, nncase::ntt::Dimensions TDims>
diff --git a/ntt/include/nncase/ntt/shape_infer/reduce.h b/ntt/include/nncase/ntt/shape_infer/reduce.h
index 0c9db7d35..0c1da6764 100644
--- a/ntt/include/nncase/ntt/shape_infer/reduce.h
+++ b/ntt/include/nncase/ntt/shape_infer/reduce.h
@@ -41,9 +41,9 @@ struct reduce_source_begin_index_impl {
                               const TOutIndex &out_index) noexcept {
         auto [new_dim, new_shrinked_dims] = [&] {
             if constexpr (ReduceAxes{}.contains(fixed_dim_v<Axis>)) {
-                return std::make_tuple(0, fixed_dim_v<ShrinkedDims + 1>);
+                return ntt::make_tuple(0, fixed_dim_v<ShrinkedDims + 1>);
             } else {
-                return std::make_tuple(
+                return ntt::make_tuple(
                     (dim_t)out_index.template at<Axis - ShrinkedDims>(),
                     fixed_dim_v<ShrinkedDims>);
             }
diff --git a/ntt/include/nncase/ntt/std_containers.h b/ntt/include/nncase/ntt/std_containers.h
new file mode 100644
index 000000000..f6db8cf87
--- /dev/null
+++ b/ntt/include/nncase/ntt/std_containers.h
@@ -0,0 +1,49 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "compiler_defs.h"
+
+#ifdef __CUDA_ARCH__
+#include <cuda/std/array>
+#include <cuda/std/barrier>
+#include <cuda/std/span>
+#include <cuda/std/tuple>
+#else
+#include <array>
+#include <barrier>
+#include <span>
+#include <tuple>
+#endif
+
+#ifdef __CUDA_ARCH__
+#define NTT_NS_STD cuda::std
+#else
+#define NTT_NS_STD std
+#endif
+
+namespace nncase::ntt {
+using NTT_NS_STD::array;
+using NTT_NS_STD::barrier;
+using NTT_NS_STD::forward_as_tuple;
+using NTT_NS_STD::get;
+using NTT_NS_STD::make_tuple;
+using NTT_NS_STD::span;
+using NTT_NS_STD::tuple;
+using NTT_NS_STD::tuple_element_t;
+
+template <size_t Count> struct fixed_barrier : barrier<> {
+    constexpr fixed_barrier() : barrier<>(Count) {}
+};
+} // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/tensor.h b/ntt/include/nncase/ntt/tensor.h
index d6a7b1075..55d72f748 100644
--- a/ntt/include/nncase/ntt/tensor.h
+++ b/ntt/include/nncase/ntt/tensor.h
@@ -63,9 +63,9 @@ constexpr auto make_span(T *data, const TShape &shape,
                          const TStrides &strides) noexcept {
     if constexpr (FixedShape<TShape> && FixedStrides<TStrides>) {
         constexpr size_t size = linear_size(TShape{}, TStrides{});
-        return std::span<T, size>(data, size);
+        return ntt::span<T, size>(data, size);
     } else {
-        return std::span<T>(data, linear_size(shape, strides));
+        return ntt::span<T>(data, linear_size(shape, strides));
     }
 }
 
@@ -149,13 +149,14 @@ class basic_tensor
     using storage_type::elements;
 
     template <bool IsViewV = IsView, class = std::enable_if_t<!IsViewV>>
-    NTT_ALWAYS_INLINE constexpr basic_tensor(TShape shape,
-                                             TStrides strides) noexcept
+    NTT_ALWAYS_INLINE constexpr basic_tensor(TShape shape = {},
+                                             TStrides strides = {}) noexcept
         : size_impl_type(std::move(shape), std::move(strides)),
           storage_type(shape.length()) {}
 
-    NTT_ALWAYS_INLINE constexpr basic_tensor(buffer_type buffer, TShape shape,
-                                             TStrides strides) noexcept
+    NTT_ALWAYS_INLINE constexpr basic_tensor(buffer_type buffer,
+                                             TShape shape = {},
+                                             TStrides strides = {}) noexcept
         : size_impl_type(std::move(shape), std::move(strides)),
           storage_type(std::in_place, std::move(buffer)) {}
 
diff --git a/ntt/include/nncase/ntt/ukernels/u_mul_add.h b/ntt/include/nncase/ntt/ukernels/u_mul_add.h
index 4f593de80..7a6113274 100644
--- a/ntt/include/nncase/ntt/ukernels/u_mul_add.h
+++ b/ntt/include/nncase/ntt/ukernels/u_mul_add.h
@@ -31,42 +31,51 @@ enum class matmul_vectorize_kind {
 };
 } // namespace ukernels
 
-template <ukernels::matmul_vectorize_kind VectorizeKind, bool AccC, class TLhsElem,
-          class TRhsElem, class TOutElem>
-void u_mul_add(const TLhsElem &lhs, const TRhsElem &rhs, TOutElem &output) {
+template <ukernels::matmul_vectorize_kind VectorizeKind, bool AccC,
+          class TLhsElem, class TRhsElem, class TOutElem>
+constexpr void u_mul_add(const TLhsElem &lhs, const TRhsElem &rhs,
+                         TOutElem &output) {
     // 1. 0D-vectorize
-    if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::no_vectorize) {
+    if constexpr (VectorizeKind ==
+                  ukernels::matmul_vectorize_kind::no_vectorize) {
         output = AccC ? ntt::mul_add(lhs, rhs, output) : ntt::mul(lhs, rhs);
     }
     // 2. 1D-vectorize
     // 2.1. vectorize M
-    else if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_m) {
+    else if constexpr (VectorizeKind ==
+                       ukernels::matmul_vectorize_kind::vectorize_m) {
         output = AccC ? ntt::mul_add(lhs, rhs, output) : ntt::mul(lhs, rhs);
     }
     // 2.2. vectorize K
-    else if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_k) {
+    else if constexpr (VectorizeKind ==
+                       ukernels::matmul_vectorize_kind::vectorize_k) {
         auto value = ntt::inner_product(lhs, rhs);
         output = AccC ? output + value : value;
     }
     // 2.3. vectorize N
-    else if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_n) {
+    else if constexpr (VectorizeKind ==
+                       ukernels::matmul_vectorize_kind::vectorize_n) {
         output = AccC ? ntt::mul_add(lhs, rhs, output) : ntt::mul(lhs, rhs);
     }
     // 2.4. vectorize M & N
-    else if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_mn ||
-                       VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_kn) {
+    else if constexpr (VectorizeKind ==
+                           ukernels::matmul_vectorize_kind::vectorize_mn ||
+                       VectorizeKind ==
+                           ukernels::matmul_vectorize_kind::vectorize_kn) {
         auto value = ntt::outer_product(lhs, rhs);
         output = AccC ? output + value : value;
     }
     // 3.1. vectorize MK & K
-    else if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_mk) {
+    else if constexpr (VectorizeKind ==
+                       ukernels::matmul_vectorize_kind::vectorize_mk) {
         for (size_t m = 0; m < lhs.shape()[0]; m++) {
             auto value = ntt::inner_product(lhs(m), rhs);
             output(m) = AccC ? output(m) + value : value;
         }
     }
     // 3.2. vectorize MK & KN
-    else if constexpr (VectorizeKind == ukernels::matmul_vectorize_kind::vectorize_mkn) {
+    else if constexpr (VectorizeKind ==
+                       ukernels::matmul_vectorize_kind::vectorize_mkn) {
         output = ntt::vmma<AccC, false>(lhs, rhs, output);
     } else {
         static_assert(sizeof(TLhsElem) == 0, "Unsupported vectorize.");
diff --git a/ntt/include/nncase/ntt/ukernels/u_transpose.h b/ntt/include/nncase/ntt/ukernels/u_transpose.h
index 4f8d6efa8..b7c792020 100644
--- a/ntt/include/nncase/ntt/ukernels/u_transpose.h
+++ b/ntt/include/nncase/ntt/ukernels/u_transpose.h
@@ -128,7 +128,7 @@ constexpr std::array<segment, Rank> get_segments() {
 
     constexpr TPerms perms;
 
-    std::array<segment, Rank> segments;
+    std::array<segment, Rank> segments{};
 
     size_t segment_count = 0;
     size_t start = 0;
@@ -227,8 +227,8 @@ template <Tensor TIn, class TOut, FixedDimensions TPerms, size_t Segments,
           size_t... Index>
     requires(bool(TIn::rank() == std::decay_t<TOut>::rank()) &&
              bool(TIn::rank() == TPerms::rank()))
-void u_transpose(const TIn &input, TOut &output, const TPerms &,
-                 std::index_sequence<Index...>) {
+constexpr void u_transpose(const TIn &input, TOut &output, const TPerms &,
+                           std::index_sequence<Index...>) {
 
     const std::array<size_t, Segments> dims_compressed =
         u_transpose_detail::compress_dimensions<TPerms, TIn, Segments>(input);
diff --git a/ntt/include/nncase/ntt/ukernels/u_unpack.h b/ntt/include/nncase/ntt/ukernels/u_unpack.h
index d4eef2206..d8673299a 100644
--- a/ntt/include/nncase/ntt/ukernels/u_unpack.h
+++ b/ntt/include/nncase/ntt/ukernels/u_unpack.h
@@ -56,7 +56,8 @@ class u_unpack_impl {
 } // namespace ukernels
 
 template <Tensor TIn, class TOut, FixedDimensions TAxes>
-void u_unpack(const TIn &input, TOut &&output, const TAxes &axes) noexcept {
+constexpr void u_unpack(const TIn &input, TOut &&output,
+                        const TAxes &axes) noexcept {
     ukernels::u_unpack_impl<TIn, std::decay_t<TOut>, TAxes::rank(), true> impl;
     impl(input, output, axes);
 }
diff --git a/ntt/include/nncase/ntt/utility.h b/ntt/include/nncase/ntt/utility.h
index 27bde344a..bda505347 100644
--- a/ntt/include/nncase/ntt/utility.h
+++ b/ntt/include/nncase/ntt/utility.h
@@ -13,11 +13,10 @@
  * limitations under the License.
  */
 #pragma once
-#include "nncase/ntt/dimension.h"
-#include "shape.h"
+#include "dimension.h"
+#include "std_containers.h"
 #include "tensor_traits.h"
 #include <cstring>
-#include <span>
 #include <type_traits>
 
 namespace nncase::ntt {
@@ -49,8 +48,8 @@ constexpr auto get_safe_stride(const TTensor &tensor,
 } // namespace utility_detail
 
 template <class U, class T, size_t Extent>
-constexpr auto span_cast(std::span<T, Extent> src) noexcept {
-    using return_type = std::span<U, Extent == std::dynamic_extent
+constexpr auto span_cast(ntt::span<T, Extent> src) noexcept {
+    using return_type = ntt::span<U, Extent == std::dynamic_extent
                                          ? std::dynamic_extent
                                          : Extent * sizeof(T) / sizeof(U)>;
     if constexpr (std::is_const_v<U>) {
@@ -61,9 +60,9 @@ constexpr auto span_cast(std::span<T, Extent> src) noexcept {
 }
 
 template <class T, size_t SrcExtent, Dimension TOffset, Dimension TExtent>
-constexpr auto make_subspan(std::span<T, SrcExtent> src, const TOffset &offset,
+constexpr auto make_subspan(ntt::span<T, SrcExtent> src, const TOffset &offset,
                             const TExtent &extent) noexcept {
-    using return_type = std::span<
+    using return_type = ntt::span<
         T, ntt::where(std::integral_constant<bool, FixedDimension<TExtent>>{},
                       TExtent{}, std::dynamic_extent)>;
     return return_type{src.data() + dim_value(offset),
diff --git a/ntt/include/nncase/ntt/vector.h b/ntt/include/nncase/ntt/vector.h
index f383b2c8a..32adf47fd 100644
--- a/ntt/include/nncase/ntt/vector.h
+++ b/ntt/include/nncase/ntt/vector.h
@@ -47,10 +47,11 @@ class basic_vector
     }
 
     template <ScalarOrVector U>
-    static basic_vector<T, Lanes> from_scalar(U value) noexcept;
+    constexpr static basic_vector<T, Lanes> from_scalar(U value) noexcept;
 
     template <ScalarOrVector U>
-    static basic_vector<T, Lanes> unaligned_load_from(const U *ptr) noexcept;
+    constexpr static basic_vector<T, Lanes>
+    unaligned_load_from(const U *ptr) noexcept;
 
     constexpr basic_vector() noexcept = default;
     constexpr basic_vector(const buffer_type &buffer) noexcept
@@ -58,8 +59,8 @@ class basic_vector
     constexpr explicit basic_vector(element_type value) noexcept
         : basic_vector(from_scalar(value)) {}
 
-    operator const buffer_type &() const noexcept { return buffer(); }
-    operator buffer_type &() noexcept { return buffer(); }
+    constexpr operator const buffer_type &() const noexcept { return buffer(); }
+    constexpr operator buffer_type &() noexcept { return buffer(); }
 
     constexpr const buffer_type &buffer() const noexcept { return buffer_; }
     constexpr buffer_type &buffer() noexcept { return buffer_; }
@@ -155,31 +156,28 @@ template <Vector T> struct vector_rank<T> {
 
 template <class T> constexpr inline auto vector_rank_v = vector_rank<T>::value;
 
-template <typename TShape>
-struct last_lane;
+template <typename TShape> struct last_lane;
 
-template <nncase::ntt::Dimension D>
-struct last_lane<nncase::ntt::shape_t<D>> {
+template <nncase::ntt::Dimension D> struct last_lane<nncase::ntt::shape_t<D>> {
     static constexpr size_t value = D::value;
 };
 
 template <nncase::ntt::Dimension D1, nncase::ntt::Dimension... Dims>
 struct last_lane<nncase::ntt::shape_t<D1, Dims...>> {
-    static constexpr size_t value = last_lane<nncase::ntt::shape_t<Dims...>>::value;
+    static constexpr size_t value =
+        last_lane<nncase::ntt::shape_t<Dims...>>::value;
 };
 
-template <nncase::ntt::Vector TVec>
-struct get_last_lane_vector {
+template <nncase::ntt::Vector TVec> struct get_last_lane_vector {
     using element_type = typename TVec::element_type;
     using shape_type = typename TVec::shape_type;
-    
+
     static constexpr size_t last_dim = last_lane<shape_type>::value;
-    
+
     using type = nncase::ntt::replace_lanes_t<TVec, last_dim>;
 };
 
-template<typename TVec>
+template <typename TVec>
 using get_last_lane_vector_t = typename get_last_lane_vector<TVec>::type;
 
-
 } // namespace nncase::ntt
diff --git a/ntt/include/nncase/ntt/vector_ops.h b/ntt/include/nncase/ntt/vector_ops.h
index c01dec2e0..b8ea8bc92 100644
--- a/ntt/include/nncase/ntt/vector_ops.h
+++ b/ntt/include/nncase/ntt/vector_ops.h
@@ -116,9 +116,6 @@ struct tensor_binary_impl<Op, TVector, T2> {
 
         return value;
     }
-
-    //   private:
-    //     Op<element_type1, element_type2> op_;
 };
 
 // T1 scalar, T2 1D vector or 2D vector
@@ -823,14 +820,15 @@ struct vmma {
 namespace nncase::ntt {
 template <Scalar T, FixedShape Lanes>
 template <ScalarOrVector U>
-basic_vector<T, Lanes> basic_vector<T, Lanes>::from_scalar(U value) noexcept {
+constexpr basic_vector<T, Lanes>
+basic_vector<T, Lanes>::from_scalar(U value) noexcept {
     return vector_ops::vload_scalar<basic_vector<T, Lanes>>()(value);
 }
 
 template <Scalar T, FixedShape Lanes>
 template <ScalarOrVector U>
-basic_vector<T, Lanes>
-basic_vector<T, Lanes>::unaligned_load_from(const U *ptr) noexcept {
+basic_vector<T, Lanes> constexpr basic_vector<T, Lanes>::unaligned_load_from(
+    const U *ptr) noexcept {
     return vector_ops::vunaligned_load<basic_vector<T, Lanes>, U>()(ptr);
 }
 
diff --git a/ntt/src/cpu_runtime.cpp b/ntt/src/cpu_runtime.cpp
index 90409b5d5..22af8a7b5 100644
--- a/ntt/src/cpu_runtime.cpp
+++ b/ntt/src/cpu_runtime.cpp
@@ -12,6 +12,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <chrono>
 #include <cstdarg>
 #include <cstddef>
 #include <cstring>
@@ -19,6 +20,7 @@
 #include <nncase/ntt/arch/cpu/distributed.h>
 #include <nncase/ntt/arch/cpu/runtime.h>
 #include <nncase/ntt/distributed.h>
+#include <nncase/ntt/profiling.h>
 #include <nncase/ntt/shape.h>
 #include <thread>
 #include <vector>
@@ -92,6 +94,26 @@ void thread_free(void *ptr) {
     free(ptr);
 #endif
 }
+
+bool is_profiling_enabled() noexcept {
+    return cpu_thread_context_t::current().enable_profiling;
+}
+
+uint64_t get_profile_time() noexcept {
+    return std::chrono::duration_cast<std::chrono::microseconds>(
+               std::chrono::high_resolution_clock::now().time_since_epoch())
+        .count();
+}
+
+void record_profile(profile_level level,
+                    const profile_record &record) noexcept {
+    // Other levels are not supported yet.
+    if (level == profile_level::kernel) {
+        auto &ctx = cpu_thread_context_t::current();
+        auto idx = ctx.profile_record_counts[0]++;
+        ctx.profile_records[idx] = record;
+    }
+}
 } // namespace nncase::ntt::runtime
 
 cpu_thread_context_t &cpu_thread_context_t::current() noexcept {
@@ -115,18 +137,25 @@ extern "C" void block_entry(const cpu_block_entry_params_t &params) {
     std::vector<std::thread> threads;
     for (size_t tid = 0; tid < tdim; tid++) {
         threads.emplace_back([tid, params] {
+            // Get thread local profile records
+            auto block_profile_records = params.profile_records;
+            const auto profile_records_size =
+                block_profile_records.size() / params.tdim;
+            auto profile_records = block_profile_records.subspan(
+                profile_records_size * tid, profile_records_size);
+
 #ifdef __APPLE__
             pthread_setspecific(
-                cpu_thread_context_key,
-                new cpu_thread_context_t
+                cpu_thread_context_key, new cpu_thread_context_t
 #else
             cpu_thread_context_t::current() =
 #endif
-                {
-                    .tid = tid, .bid = params.bid, .cid = params.cid,
-                    .timer_records = &(params.timer_records[tid]),
-                    .enable_profiling = params.enable_profiling
-                }
+                {.tid = tid,
+                 .bid = params.bid,
+                 .cid = params.cid,
+                 .enable_profiling = params.enable_profiling,
+                 .profile_records = profile_records,
+                 .profile_record_counts = params.profile_record_counts + tid}
 #ifdef __APPLE__
             );
 #else
@@ -207,9 +236,9 @@ extern "C" void block_entry(const cpu_block_entry_params_t &params) {
 
             thread_main(params.input_descs, params.output_descs,
                         params.rdata.data(), thread_local_rdata.data(),
-                        params.block_local_rdata.data(),
-                        thread_local_data.data(), block_local_data.data(),
-                        params.output);
+                        nullptr, params.block_local_rdata.data(),
+                        thread_local_data.data(), nullptr,
+                        block_local_data.data(), params.output);
         });
     }
 
diff --git a/ntt/src/cuda_runtime.cu b/ntt/src/cuda_runtime.cu
new file mode 100644
index 000000000..caf15516d
--- /dev/null
+++ b/ntt/src/cuda_runtime.cu
@@ -0,0 +1,173 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cstddef>
+#include <cstring>
+#include <nncase/ntt/arch/cuda/distributed.h>
+#include <nncase/ntt/arch/cuda/runtime.h>
+#include <nncase/ntt/distributed.h>
+#include <nncase/ntt/profiling.h>
+#include <nncase/ntt/shape.h>
+#include <nncase/ntt/vector.h>
+
+using namespace nncase;
+using namespace nncase::ntt;
+using namespace nncase::ntt::distributed;
+using namespace nncase::ntt::runtime;
+
+decltype(nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
+    nncase::ntt::distributed::topology_shape))
+    nncase::ntt::distributed::detail::global_local_data_ptr =
+        nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
+            nncase::ntt::distributed::topology_shape);
+
+decltype(nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
+    nncase::ntt::distributed::topology_shape))
+    nncase::ntt::distributed::detail::global_thread_local_rdata_ptr =
+        nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
+            nncase::ntt::distributed::topology_shape);
+
+decltype(nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 3>>(
+    nncase::ntt::distributed::topology_shape))
+    nncase::ntt::distributed::detail::global_thread_local_cache_ptr =
+        nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 3>>(
+            nncase::ntt::distributed::topology_shape);
+
+decltype(nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
+    nncase::ntt::distributed::topology_shape))
+    nncase::ntt::distributed::detail::global_block_local_rdata_ptr =
+        nncase::ntt::make_tensor<nncase::ntt::vector<uintptr_t, 2>>(
+            nncase::ntt::distributed::topology_shape);
+
+namespace nncase::ntt::runtime {
+alignas(cuda_thread_context_t) __shared__ std::byte
+    cuda_thread_contexts_storage[sizeof(cuda_thread_context_t) * tdim() *
+                                 wdim()];
+
+__device__ bool is_profiling_enabled() noexcept {
+    return cuda_thread_context_t::current().enable_profiling;
+}
+
+__device__ uint64_t get_profile_time() noexcept { return clock64(); }
+
+__device__ void record_profile(profile_level level,
+                               const profile_record &record) noexcept {
+    // Other levels are not supported yet.
+    if (level == profile_level::kernel) {
+        auto &ctx = cuda_thread_context_t::current();
+        auto idx = ctx.profile_record_counts[0]++;
+        ctx.profile_records[idx] = record;
+    }
+}
+} // namespace nncase::ntt::runtime
+
+__device__ cuda_thread_context_t &cuda_thread_context_t::current() noexcept {
+    auto &cuda_thread_contexts =
+        *reinterpret_cast<cuda_thread_context_t(*)[tdim() * wdim()]>(
+            cuda_thread_contexts_storage);
+    return cuda_thread_contexts[wid() * tdim() + tid()];
+}
+
+extern "C" __global__ void
+block_entry(const cuda_block_entry_params_t &params) {
+    const auto linear_wid = bid() * wdim() + wid();
+    const auto linear_tid = linear_wid * tdim() + tid();
+
+    // Get thread local rdata
+    auto thread_local_rdata_offset =
+        (size_t)params.thread_local_rdata_header[linear_tid * 2];
+    auto thread_local_rdata_size =
+        (size_t)params.thread_local_rdata_header[linear_tid * 2 + 1];
+    auto thread_local_rdata = params.thread_local_rdata.subspan(
+        thread_local_rdata_offset, thread_local_rdata_size);
+
+    // Get thread local data
+    auto thread_local_chip_data = params.thread_local_data;
+    const auto thread_local_data_size =
+        thread_local_chip_data.size_bytes() / (bdim() * wdim() * tdim());
+    auto thread_local_data = thread_local_chip_data.subspan(
+        thread_local_data_size * linear_tid, thread_local_data_size);
+
+    // Get warp local rdata
+    auto warp_local_rdata_offset =
+        (size_t)params.warp_local_rdata_header[linear_wid * 2];
+    auto warp_local_rdata_size =
+        (size_t)params.warp_local_rdata_header[linear_wid * 2 + 1];
+    auto warp_local_rdata = params.warp_local_rdata.subspan(
+        warp_local_rdata_offset, warp_local_rdata_size);
+
+    // Get warp local data
+    auto warp_local_chip_data = params.warp_local_data;
+    const auto warp_local_data_size =
+        warp_local_chip_data.size_bytes() / (bdim() * wdim());
+    auto warp_local_data = warp_local_chip_data.subspan(
+        warp_local_data_size * linear_wid, warp_local_data_size);
+
+    // Get block local rdata
+    auto block_local_rdata_offset =
+        (size_t)params.block_local_rdata_header[bid() * 2];
+    auto block_local_rdata_size =
+        (size_t)params.block_local_rdata_header[bid() * 2 + 1];
+    auto block_local_rdata = params.block_local_rdata.subspan(
+        block_local_rdata_offset, block_local_rdata_size);
+
+    // Get block local data
+    auto block_local_chip_data = params.block_local_data;
+    const auto block_local_data_size =
+        block_local_chip_data.size_bytes() / bdim();
+    auto block_local_data = block_local_chip_data.subspan(
+        block_local_data_size * bid(), block_local_data_size);
+
+    // Get thread local profile records
+    auto block_profile_records = params.profile_records;
+    const auto profile_records_size =
+        block_profile_records.size() / (bdim() * wdim() * tdim());
+    auto profile_records = block_profile_records.subspan(
+        profile_records_size * linear_tid, profile_records_size);
+
+    cuda_thread_context_t::current() = {
+        .cid = params.cid,
+        .enable_profiling = params.enable_profiling,
+        .profile_records = profile_records,
+        .profile_record_counts = params.profile_record_counts + linear_tid};
+
+    const auto program_ids = make_shape(params.cid, bid(), wid(), tid());
+
+    // Set distributed pointers
+    ntt::distributed::detail::global_thread_local_rdata_ptr(program_ids)(
+        0_dim) = (uintptr_t)thread_local_rdata.data();
+    ntt::distributed::detail::global_thread_local_rdata_ptr(program_ids)(
+        1_dim) = (uintptr_t)(thread_local_rdata.data() +
+                             thread_local_rdata.size_bytes());
+    ntt::distributed::detail::global_local_data_ptr(program_ids)(0_dim) =
+        (uintptr_t)thread_local_data.data();
+    ntt::distributed::detail::global_local_data_ptr(program_ids)(1_dim) =
+        (uintptr_t)(thread_local_data.data() + thread_local_data.size_bytes());
+    ntt::distributed::detail::global_block_local_rdata_ptr(program_ids)(0_dim) =
+        (uintptr_t)params.block_local_rdata.data();
+    ntt::distributed::detail::global_block_local_rdata_ptr(program_ids)(1_dim) =
+        (uintptr_t)(params.block_local_rdata.data() +
+                    params.block_local_rdata.size_bytes());
+
+    distributed::topology_synchronize();
+    thread_main(params.input_descs, params.output_descs, params.rdata.data(),
+                thread_local_rdata.data(), warp_local_rdata.data(),
+                block_local_rdata.data(), thread_local_data.data(),
+                warp_local_data.data(), block_local_data.data(), params.output);
+}
+
+int main() {
+    cuda_block_entry_params_t params = {};
+    block_entry<<<1, 1>>>(params);
+}
diff --git a/ntt/test/ctest/test_ntt_cuda.cpp b/ntt/test/ctest/test_ntt_cuda.cpp
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/nncase/__init__.py b/python/nncase/__init__.py
index efe4206cc..c83cd8afd 100644
--- a/python/nncase/__init__.py
+++ b/python/nncase/__init__.py
@@ -321,7 +321,7 @@ def _import_huggingface_module(self, model_dir: str, options: ImportOptions) ->
 
 def check_target(target: str):
     def test_target(target: str):
-        return target in ["cpu", "k510", "k230", "xpu"]
+        return target in ["cpu", "k510", "k230", "xpu", "cuda"]
 
     def target_exists(target: str):
         return _nncase.Target.exists(target)
diff --git a/src/Native/include/nncase/runtime/cuda/runtime_module.h b/src/Native/include/nncase/runtime/cuda/runtime_module.h
new file mode 100644
index 000000000..abd6af9e8
--- /dev/null
+++ b/src/Native/include/nncase/runtime/cuda/runtime_module.h
@@ -0,0 +1,26 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../runtime_module.h"
+
+BEGIN_NS_NNCASE_RT_MODULE(cuda)
+
+NNCASE_INLINE_VAR constexpr module_kind_t cuda_module_kind =
+    to_module_kind("cuda");
+NNCASE_INLINE_VAR constexpr uint32_t cuda_module_version = 1;
+
+NNCASE_API result<std::unique_ptr<runtime_module>> create_cuda_runtime_module();
+
+END_NS_NNCASE_RT_MODULE
diff --git a/src/Native/include/nncase/runtime/model.h b/src/Native/include/nncase/runtime/model.h
index 270db2815..c3d25d8b9 100644
--- a/src/Native/include/nncase/runtime/model.h
+++ b/src/Native/include/nncase/runtime/model.h
@@ -20,7 +20,7 @@ BEGIN_NS_NNCASE_RUNTIME
 
 inline constexpr size_t MAX_SECTION_NAME_LENGTH = 32;
 inline constexpr size_t MAX_MODULE_KIND_LENGTH = 16;
-inline constexpr uint32_t MODEL_HAS_NO_ENTRY = -1;
+inline constexpr uint32_t MODEL_HAS_NO_ENTRY = uint32_t(-1);
 
 typedef std::array<char, MAX_MODULE_KIND_LENGTH> module_kind_t;
 
@@ -95,7 +95,7 @@ struct shape_header {
     }
 };
 
-NNCASE_INLINE_VAR constexpr uint32_t MODEL_IDENTIFIER = 'KMDL';
+NNCASE_INLINE_VAR constexpr uint32_t MODEL_IDENTIFIER = 0x4b4d444c; // 'KMDL';
 NNCASE_INLINE_VAR constexpr uint32_t MODEL_VERSION = 7;
 
 END_NS_NNCASE_RUNTIME
diff --git a/src/Native/include/nncase/runtime/result.h b/src/Native/include/nncase/runtime/result.h
index b16d6db1d..b38297657 100644
--- a/src/Native/include/nncase/runtime/result.h
+++ b/src/Native/include/nncase/runtime/result.h
@@ -283,6 +283,13 @@ template <class T> class NNCASE_NODISCARD result {
             return std::move(*this);
     }
 
+    constexpr const T &or_(const T &default_value) noexcept {
+        if (is_ok())
+            return ok_;
+        else
+            return default_value;
+    }
+
   private:
     void destroy() {
         if (is_ok())
diff --git a/src/Native/include/nncase/runtime/simple_types.h b/src/Native/include/nncase/runtime/simple_types.h
index 2a09c64bb..78e522063 100644
--- a/src/Native/include/nncase/runtime/simple_types.h
+++ b/src/Native/include/nncase/runtime/simple_types.h
@@ -131,7 +131,7 @@ inline constexpr size_t typecode_bytes(typecode_t typecode) {
     case dt_reference:
         return sizeof(intptr_t);
     default:
-        return -1;
+        return size_t(-1);
     }
 }
 
diff --git a/src/Native/src/CMakeLists.txt b/src/Native/src/CMakeLists.txt
index 7a7e58177..e3ece8087 100644
--- a/src/Native/src/CMakeLists.txt
+++ b/src/Native/src/CMakeLists.txt
@@ -30,6 +30,10 @@ add_subdirectory(runtime)
 if (NOT BUILDING_RUNTIME)
     add_executable(nncasetest test.cpp)
     target_link_libraries(nncasetest PRIVATE nncaseruntime)
+    if(ENABLE_CUDA_RUNTIME)
+        add_executable(nncase_cudatest cuda_test.cu)
+        target_link_libraries(nncase_cudatest PRIVATE nncaseruntime CUDA::cudart)
+    endif()
     if(MSVC)
     else()
       if(APPLE)
@@ -63,4 +67,4 @@ else()
     install(EXPORT nncaseTargets DESTINATION lib/cmake/nncase)
     configure_file(${CMAKE_CURRENT_LIST_DIR}/../../../cmake/nncaseConfig.cmake.in nncaseConfig.cmake @ONLY)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/nncaseConfig.cmake DESTINATION lib/cmake/nncase)
-endif()
\ No newline at end of file
+endif()
diff --git a/src/Native/src/cuda_test.cu b/src/Native/src/cuda_test.cu
new file mode 100644
index 000000000..4d5291f49
--- /dev/null
+++ b/src/Native/src/cuda_test.cu
@@ -0,0 +1,1481 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <nncase/api.h>
+#include <nncase/compiler.h>
+#include <nncase/io_utils.h>
+#include <nncase/ntt/ntt.h>
+#include <nncase/runtime/runtime_tensor.h>
+#include <string_view>
+#include <sys/stat.h>
+#include <type_traits>
+
+namespace nncase::ntt::runtime {
+// just for test
+#ifdef __CUDA_ARCH__
+__device__ cuda_thread_context_t &cuda_thread_context_t::current() noexcept {
+    static cuda_thread_context_t ctx{};
+    return ctx;
+}
+#else
+cpu_thread_context_t &cpu_thread_context_t::current() noexcept {
+    static cpu_thread_context_t ctx{.tid = 0, .bid = 0, .cid = 0};
+    return ctx;
+}
+#endif
+} // namespace nncase::ntt::runtime
+
+using namespace nncase;
+using namespace nncase::clr;
+using namespace nncase::runtime;
+using namespace std::string_view_literals;
+
+constexpr bool are_floats_equal(float a, float b, float epsilon = 1e-6) {
+    return std::fabs(a - b) < epsilon;
+}
+
+#define CHECK_CUDA(ans)                                                        \
+    {                                                                          \
+        gpuAssert((ans), __FILE__, __LINE__);                                  \
+    }
+inline void gpuAssert(cudaError_t code, const char *file, int line) {
+    if (code != cudaSuccess) {
+        fprintf(stderr, "CUDA error %s %d: %s\n", file, line,
+                cudaGetErrorString(code));
+        exit(code);
+    }
+}
+
+#define TRY(x)                                                                 \
+    if (x)                                                                     \
+        throw 1;
+
+NTT_HOST_DEVICE void test_shape() {
+
+    // fixed shape
+    {
+        constexpr auto shape = ntt::fixed_shape_v<1, 16>;
+        auto dim1 = shape[dim_zero];
+        static_assert(dim1.value == 1);
+        auto dim2 = shape[dim_one];
+        static_assert(dim2.value == 16);
+        auto sub_dim = dim2 - shape.rank();
+        static_assert(sub_dim.value == 14);
+        static_assert(FixedDimension<decltype(sub_dim)>);
+        static_assert(shape.contains(1));
+        static_assert(shape.contains((size_t)16));
+        static_assert(!shape.contains(2));
+
+        auto appended_shape = shape.append(fixed_dim_v<2>);
+        static_assert(appended_shape.rank() == 3);
+        static_assert(appended_shape[dim_zero] == 1);
+        static_assert(appended_shape[dim_one] == 16);
+        static_assert(appended_shape[2] == 2);
+
+        auto squeezed_shape = ntt::squeeze_dims(shape, fixed_shape_v<0>);
+        static_assert(squeezed_shape.rank() == 1);
+
+        auto concat_shape = shape.concat(fixed_shape_v<2, 3>);
+        static_assert(concat_shape.rank() == 4);
+
+        constexpr auto replaced_shape = shape.replace_at<0>(fixed_dim_v<2>);
+        static_assert(replaced_shape.rank() == 2);
+        static_assert(replaced_shape.length() == 32);
+        static_assert(
+            linear_size(replaced_shape, default_strides(replaced_shape)) == 32);
+    }
+
+    {
+        float arr[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
+        auto buffer = ntt::span(arr);
+        auto tv = ntt::make_tensor_view(buffer, ntt::fixed_shape_v<2, 4>);
+        static_assert(tv.rank() == 2);
+    }
+
+    {
+        const float arr[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
+        auto buffer = ntt::span(arr);
+        auto tv = ntt::make_tensor_view(buffer, ntt::fixed_shape_v<2, 4>);
+        static_assert(tv.rank() == 2);
+    }
+
+    {
+        const float buffer[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
+        auto tv = ntt::make_tensor_view(buffer, ntt::fixed_shape_v<2, 4>);
+        static_assert(tv.rank() == 2);
+    }
+
+    {
+        using v1_type = ntt::vector<float, 8>;
+        using v2_type = ntt::replace_element_t<v1_type, int>;
+        static_assert(std::is_same_v<v2_type, ntt::vector<int, 8>>);
+    }
+}
+
+NTT_HOST_DEVICE void test_strides() {
+    // dynamic shape strides
+    {
+        NNCASE_UNUSED auto shape =
+            ntt::make_shape(2_dim, 1, 3_dim); // dim1 is dynamic
+        NNCASE_UNUSED auto strides = ntt::make_strides(3_dim, 3_dim, 1_dim);
+        assert(ntt::contiguous_dims(shape, strides) == 3);
+
+        NNCASE_UNUSED auto shape1 =
+            ntt::make_shape(8_dim, 1); // dim1 is dynamic
+        NNCASE_UNUSED auto strides1 = ntt::make_strides(2_dim, 1_dim);
+        assert(ntt::contiguous_dims(shape1, strides1) == 1);
+    }
+}
+
+NTT_HOST_DEVICE void test_sharding() {
+    // local_index
+    {
+        using namespace ntt::distributed;
+        using mesh_type =
+            ntt::distributed::mesh<ntt::distributed::topology::thread, 1>;
+
+        static_assert(ntt::distributed::program_dim<
+                          ntt::distributed::topology::thread>() == 1);
+        static_assert(
+            ntt::distributed::detail::get_submesh_end<mesh_type,
+                                                      topology::thread>() == 1);
+        static_assert(ntt::distributed::detail::get_submesh_rank<
+                          mesh_type, topology::thread>() == 1);
+        static_assert(
+            ntt::distributed::detail::get_submesh_rank<mesh_type,
+                                                       topology::chip>() == 0);
+        static_assert(ntt::distributed::detail::get_submesh_start<
+                          mesh_type, topology::thread>() == 0);
+        auto program_ids = ntt::distributed::program_ids<>();
+        auto local_index = mesh_type::index_from_program_ids(program_ids);
+        static_assert(local_index.rank() == 1);
+
+        auto sharding = ntt::distributed::make_sharding<mesh_type>(
+            ntt::distributed::shard_policy::B,
+            ntt::distributed::shard_policy::B);
+
+        auto global_shape = ntt::fixed_shape_v<2, 4>;
+        constexpr auto local_shape =
+            sharding.shard_shape(global_shape, local_index);
+        static_assert(local_shape == ntt::fixed_shape_v<2, 4>);
+
+        const float buffer[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
+        auto s_tensor = ntt::distributed::make_sharded_tensor_view(
+            buffer, global_shape,
+            ntt::distributed::make_sharding<mesh_type>(
+                ntt::distributed::shard_policy::B,
+                ntt::distributed::shard_policy::B),
+            ntt::fixed_strides_v<4, 1>);
+        static_assert(s_tensor.local().shape() == ntt::fixed_shape_v<2, 4>);
+    }
+
+    // Sharding
+    {
+        using mesh_type =
+            ntt::distributed::mesh<ntt::distributed::topology::thread, 1, 1, 1>;
+
+        auto sharding = ntt::distributed::make_sharding<mesh_type>(
+            ntt::distributed::shard_policy::B,
+            ntt::distributed::shard_policy::S<2>(),
+            ntt::distributed::shard_policy::B);
+        using sharding_type = std::remove_cv_t<decltype(sharding)>;
+        static_assert(
+            ntt::distributed::detail::mesh_axes_mask_of_split_shard_policies<
+                sharding_type>() == ntt::fixed_shape_v<0, 0, 1>);
+        static_assert(
+            ntt::distributed::detail::mesh_axes_of_non_split_shard_policies<
+                sharding_type>() == ntt::fixed_shape_v<0, 1>);
+    }
+}
+
+NTT_HOST_DEVICE void test_matmul_normal() {
+    // no vectorize
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 4>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 2>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 2>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        ntt::matmul<false>(ta, tb, tc);
+        assert(tc(0, 0) == 28.f);
+        assert(tc(0, 1) == 34.f);
+        assert(tc(1, 0) == 76.f);
+        assert(tc(1, 1) == 98.f);
+        assert(tc(2, 0) == 124.f);
+        assert(tc(2, 1) == 162.f);
+        ntt::matmul<false>(ta, tb, tc, 0.5f);
+        assert(tc(0, 0) == 28.f * 0.5f);
+        assert(tc(0, 1) == 34.f * 0.5f);
+        assert(tc(1, 0) == 76.f * 0.5f);
+        assert(tc(1, 1) == 98.f * 0.5f);
+        assert(tc(2, 0) == 124.f * 0.5f);
+        assert(tc(2, 1) == 162.f * 0.5f);
+
+        // auto ta_f16 = ntt::make_tensor<half>(ntt::fixed_shape_v<3, 4>);
+        // auto tb_f16 = ntt::make_tensor<half>(ntt::fixed_shape_v<4, 2>);
+        // std::iota(ta_f16.elements().begin(), ta_f16.elements().end(), 0.f);
+        // std::iota(tb_f16.elements().begin(), tb_f16.elements().end(), 0.f);
+        // ntt::matmul<false>(ta_f16, tb_f16, tc, 0.5f);
+        // assert(tc(0, 0) == 28.f * 0.5f);
+        // assert(tc(0, 1) == 34.f * 0.5f);
+        // assert(tc(1, 0) == 76.f * 0.5f);
+        // assert(tc(1, 1) == 98.f * 0.5f);
+        // assert(tc(2, 0) == 124.f * 0.5f);
+        // assert(tc(2, 1) == 162.f * 0.5f);
+
+        auto te = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 1, 3, 4>);
+        auto tf = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 4, 5>);
+        std::iota(te.elements().begin(), te.elements().end(), 0.f);
+        std::iota(tf.elements().begin(), tf.elements().end(), 0.f);
+        auto tg = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 2, 3, 5>);
+        ntt::matmul<false>(te, tf, tg);
+        assert(tg(0, 0, 0, 0) == 70.f);
+        assert(tg(0, 0, 1, 0) == 190.f);
+        assert(tg(0, 0, 2, 0) == 310.f);
+        assert(tg(0, 1, 0, 0) == 190.f);
+        assert(tg(0, 1, 1, 0) == 630.f);
+        assert(tg(0, 1, 2, 0) == 1070.f);
+    }
+
+    // vectorized matmul 1d on k
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<16, 2>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 2>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 2>);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<2, 2>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<0>);
+        ntt::matmul<false>(pa, pb, tc, nullptr, ntt::fixed_shape_v<1>, {},
+                           ntt::fixed_shape_v<0>);
+        assert(tc(0, 0) == 2480.f);
+        assert(tc(0, 1) == 2600.f);
+        assert(tc(1, 0) == 6320.f);
+        assert(tc(1, 1) == 6696.f);
+        assert(tc(2, 0) == 10160.f);
+        assert(tc(2, 1) == 10792.f);
+        ntt::matmul<false>(pa, pb, tc, 0.5f, ntt::fixed_shape_v<1>, {},
+                           ntt::fixed_shape_v<0>);
+        assert(tc(0, 0) == 2480.f * 0.5f);
+        assert(tc(0, 1) == 2600.f * 0.5f);
+        assert(tc(1, 0) == 6320.f * 0.5f);
+        assert(tc(1, 1) == 6696.f * 0.5f);
+        assert(tc(2, 0) == 10160.f * 0.5f);
+        assert(tc(2, 1) == 10792.f * 0.5f);
+    }
+
+    // vectorized matmul 1d on m
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 2>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<1, 8>);
+        auto pc =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<1, 2>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<0>);
+        ntt::matmul<false>(pa, tb, pc, nullptr, ntt::fixed_shape_v<0>, {}, {},
+                           {});
+        assert(are_floats_equal(pc(0, 0)(0), 280.f));
+        assert(are_floats_equal(pc(0, 1)(0), 308.f));
+        assert(are_floats_equal(pc(0, 0)(1), 728.f));
+        assert(are_floats_equal(pc(0, 1)(1), 820.f));
+        assert(are_floats_equal(pc(0, 0)(2), 1176.f));
+        assert(are_floats_equal(pc(0, 1)(2), 1332.f));
+        assert(are_floats_equal(pc(0, 0)(3), 1624.f));
+        assert(are_floats_equal(pc(0, 1)(3), 1844.f));
+
+        // vectorized matmul 1d on m, but not same type
+        // {
+        //     auto ta = ntt::make_tensor<half>(ntt::fixed_shape_v<4, 8>);
+        //     auto tb = ntt::make_tensor<half>(ntt::fixed_shape_v<8, 2>);
+        //     std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        //     std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        //     auto pa = ntt::make_tensor<ntt::vector<half, 8>>(
+        //         ntt::fixed_shape_v<1, 8>);
+        //     auto pc = ntt::make_tensor<ntt::vector<float, 8>>(
+        //         ntt::fixed_shape_v<1, 2>);
+        //     ntt::pack(ta, pa, ntt::fixed_shape_v<0>);
+        //     ntt::matmul<false>(pa, tb, pc, 0.5f, ntt::fixed_shape_v<0>, {},
+        //     {},
+        //                        {});
+        //     assert(are_floats_equal(pc(0, 0)(0), 280.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 1)(0), 308.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 0)(1), 728.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 1)(1), 820.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 0)(2), 1176.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 1)(2), 1332.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 0)(3), 1624.f * 0.5f));
+        //     assert(are_floats_equal(pc(0, 1)(3), 1844.f * 0.5f));
+        // }
+    }
+
+    // vectorized matmul 1d on n
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<8, 1>);
+        auto pc =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<3, 1>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<1>);
+        ntt::matmul<false>(ta, pb, pc, nullptr, ntt::fixed_shape_v<>, {},
+                           ntt::fixed_shape_v<1>, ntt::fixed_shape_v<0>);
+        assert(are_floats_equal(pc(0, 0)(0), 560.f));
+        assert(are_floats_equal(pc(0, 0)(1), 588.f));
+        assert(are_floats_equal(pc(0, 0)(2), 616.f));
+        assert(are_floats_equal(pc(0, 0)(3), 644.f));
+        assert(are_floats_equal(pc(1, 0)(0), 1456.f));
+        assert(are_floats_equal(pc(1, 0)(1), 1548.f));
+        assert(are_floats_equal(pc(1, 0)(2), 1640.f));
+        assert(are_floats_equal(pc(1, 0)(3), 1732.f));
+        assert(are_floats_equal(pc(2, 0)(0), 2352.f));
+        assert(are_floats_equal(pc(2, 0)(1), 2508.f));
+        assert(are_floats_equal(pc(2, 0)(2), 2664.f));
+        assert(are_floats_equal(pc(2, 0)(3), 2820.f));
+    }
+
+    // vectorized matmul 1d on m(A) and n(B)
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<1, 8>);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<8, 1>);
+        auto pc = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 1>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<0>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<1>);
+        ntt::matmul<false>(pa, pb, pc, nullptr, ntt::fixed_shape_v<0>, {},
+                           ntt::fixed_shape_v<1>, {});
+        assert(are_floats_equal(pc(0, 0)(0, 0), 560.f));
+        assert(are_floats_equal(pc(0, 0)(0, 1), 588.f));
+        assert(are_floats_equal(pc(0, 0)(0, 2), 616.f));
+        assert(are_floats_equal(pc(0, 0)(0, 3), 644.f));
+        assert(are_floats_equal(pc(0, 0)(1, 0), 1456.f));
+        assert(are_floats_equal(pc(0, 0)(1, 1), 1548.f));
+        assert(are_floats_equal(pc(0, 0)(1, 2), 1640.f));
+        assert(are_floats_equal(pc(0, 0)(1, 3), 1732.f));
+        assert(are_floats_equal(pc(0, 0)(2, 0), 2352.f));
+        assert(are_floats_equal(pc(0, 0)(2, 1), 2508.f));
+        assert(are_floats_equal(pc(0, 0)(2, 2), 2664.f));
+        assert(are_floats_equal(pc(0, 0)(2, 3), 2820.f));
+        assert(are_floats_equal(pc(0, 0)(3, 0), 3248.f));
+        assert(are_floats_equal(pc(0, 0)(3, 1), 3468.f));
+        assert(are_floats_equal(pc(0, 0)(3, 2), 3688.f));
+        assert(are_floats_equal(pc(0, 0)(3, 3), 3908.f));
+    }
+
+    // vectorized matmul 2d on mk(A) and k(B)
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 2>);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2, 4>);
+        auto pc =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<1, 4>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<0, 1>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<0>);
+        ntt::matmul<false>(pa, pb, pc, nullptr, ntt::fixed_shape_v<0, 1>,
+                           ntt::fixed_shape_v<0>, ntt::fixed_shape_v<0>,
+                           ntt::fixed_shape_v<0>);
+        assert(are_floats_equal(pc(0, 0)(0), 560.f));
+        assert(are_floats_equal(pc(0, 1)(0), 588.f));
+        assert(are_floats_equal(pc(0, 2)(0), 616.f));
+        assert(are_floats_equal(pc(0, 3)(0), 644.f));
+        assert(are_floats_equal(pc(0, 0)(1), 1456.f));
+        assert(are_floats_equal(pc(0, 1)(1), 1548.f));
+        assert(are_floats_equal(pc(0, 2)(1), 1640.f));
+        assert(are_floats_equal(pc(0, 3)(1), 1732.f));
+        assert(are_floats_equal(pc(0, 0)(2), 2352.f));
+        assert(are_floats_equal(pc(0, 1)(2), 2508.f));
+        assert(are_floats_equal(pc(0, 2)(2), 2664.f));
+        assert(are_floats_equal(pc(0, 3)(2), 2820.f));
+        assert(are_floats_equal(pc(0, 0)(3), 3248.f));
+        assert(are_floats_equal(pc(0, 1)(3), 3468.f));
+        assert(are_floats_equal(pc(0, 2)(3), 3688.f));
+        assert(are_floats_equal(pc(0, 3)(3), 3908.f));
+    }
+
+    // vectorized matmul 2d on k(A) and kn(B)
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<4, 2>);
+        auto pb = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<2, 1>);
+        auto pc =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<4, 1>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<0, 1>);
+        ntt::matmul<false>(pa, pb, pc, nullptr, ntt::fixed_shape_v<1>,
+                           ntt::fixed_shape_v<0>, ntt::fixed_shape_v<0, 1>,
+                           ntt::fixed_shape_v<0, 0>);
+        assert(are_floats_equal(pc(0, 0)(0), 560.f));
+        assert(are_floats_equal(pc(0, 0)(1), 588.f));
+        assert(are_floats_equal(pc(0, 0)(2), 616.f));
+        assert(are_floats_equal(pc(0, 0)(3), 644.f));
+        assert(are_floats_equal(pc(1, 0)(0), 1456.f));
+        assert(are_floats_equal(pc(1, 0)(1), 1548.f));
+        assert(are_floats_equal(pc(1, 0)(2), 1640.f));
+        assert(are_floats_equal(pc(1, 0)(3), 1732.f));
+        assert(are_floats_equal(pc(2, 0)(0), 2352.f));
+        assert(are_floats_equal(pc(2, 0)(1), 2508.f));
+        assert(are_floats_equal(pc(2, 0)(2), 2664.f));
+        assert(are_floats_equal(pc(2, 0)(3), 2820.f));
+        assert(are_floats_equal(pc(3, 0)(0), 3248.f));
+        assert(are_floats_equal(pc(3, 0)(1), 3468.f));
+        assert(are_floats_equal(pc(3, 0)(2), 3688.f));
+        assert(are_floats_equal(pc(3, 0)(3), 3908.f));
+    }
+
+    // vectorized matmul 2d on mk(A) and kn(B)
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 4>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 4>);
+        auto devectorizec = ntt::make_tensor<float>(ntt::fixed_shape_v<4, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 2>);
+        auto pb = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<2, 1>);
+        auto pc = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 1>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<0, 1>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<0, 1>);
+        ntt::matmul<false>(pa, pb, pc, nullptr, ntt::fixed_shape_v<0, 1>,
+                           ntt::fixed_shape_v<0>, ntt::fixed_shape_v<0, 1>,
+                           ntt::fixed_shape_v<0>);
+        ntt::unpack(pc, devectorizec.view(), ntt::fixed_shape_v<0, 1>);
+        ntt::matmul<false>(ta, tb, tc);
+        ntt::apply(tc.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tc(index) == devectorizec(index));
+        });
+    }
+
+    // vectorized matmul 1d on k with broadcast
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 1, 3, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 16, 4>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 2, 3, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+        auto pa = ntt::make_tensor<ntt::vector<float, 8>>(
+            ntt::fixed_shape_v<1, 1, 3, 2>);
+        auto pb = ntt::make_tensor<ntt::vector<float, 8>>(
+            ntt::fixed_shape_v<2, 2, 4>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<3>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<1>);
+        ntt::matmul<false>(pa, pb, tc, nullptr, ntt::fixed_shape_v<3>,
+                           ntt::fixed_shape_v<0>, ntt::fixed_shape_v<1>,
+                           ntt::fixed_shape_v<0>);
+        assert(tc(0, 0, 0, 0) == 4960.f);
+        assert(tc(0, 0, 0, 1) == 5080.f);
+        assert(tc(0, 0, 0, 2) == 5200.f);
+        assert(tc(0, 0, 0, 3) == 5320.f);
+        assert(tc(0, 1, 0, 0) == 12640.f);
+        assert(tc(0, 1, 0, 1) == 12760.f);
+        assert(tc(0, 1, 0, 2) == 12880.f);
+        assert(tc(0, 1, 0, 3) == 13000.f);
+    }
+}
+
+NTT_HOST_DEVICE void test_matmul_transpose_b() {
+    // 1. reference value
+    auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+    auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+    auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+    std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+    std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+    ntt::matmul<false>(ta, tb, tc);
+
+    // 2. reference transpose B value
+    auto tranb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+    ntt::transpose(tb, tranb, ntt::fixed_shape_v<1, 0>);
+
+    // transB no vectorize
+    {
+        auto tc1 = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+        ntt::matmul<false, false, true>(ta, tranb, tc1);
+        ntt::apply(tc.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tc1(index) == tc(index));
+        });
+    }
+
+    // transB vectorize n
+    {
+        auto vectorizeb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2, 8>);
+        ntt::pack(tranb, vectorizeb, ntt::fixed_shape_v<0>);
+        auto tc2 =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<8, 2>);
+        ntt::matmul<false, false, true>(ta, vectorizeb, tc2, nullptr, {}, {},
+                                        ntt::fixed_shape_v<0>);
+
+        auto tc2devectorize = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+        ntt::unpack(tc2, tc2devectorize, ntt::fixed_shape_v<1>);
+
+        ntt::apply(tc.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tc2devectorize(index) == tc(index));
+        });
+    }
+
+    // transB [M,K]<m> @ [N,K]<n>
+    {
+        auto vectorizea =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2, 8>);
+        ntt::pack(ta, vectorizea, ntt::fixed_shape_v<0>);
+        auto vectorizeb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2, 8>);
+        ntt::pack(tranb, vectorizeb, ntt::fixed_shape_v<0>);
+        auto tc2 = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<2, 2>);
+        ntt::matmul<false, false, true>(
+            vectorizea, vectorizeb, tc2, nullptr, ntt::fixed_shape_v<0>,
+            ntt::fixed_shape_v<>, ntt::fixed_shape_v<0>, ntt::fixed_shape_v<>);
+
+        auto tc2devectorize = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+        ntt::unpack(tc2, tc2devectorize, ntt::fixed_shape_v<0, 1>);
+
+        ntt::apply(tc.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tc2devectorize(index) == tc(index));
+        });
+    }
+
+    // A[m,k]<m,k> @ B[n,k]<k,n>
+    {
+        auto vectorizeb = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<2, 2>);
+        ntt::pack(tranb, vectorizeb, ntt::fixed_shape_v<1, 0>); // [n,k]<k,n>
+        auto vectorizea = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<2, 2>);
+        // note actully a should vectorize as [m,k]<k,m>
+        ntt::pack(ta, vectorizea, ntt::fixed_shape_v<0, 1>); // [m,k]<m,k>
+        // [m,n]<m,n>
+        auto tc2 = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<2, 2>);
+        ntt::matmul<false, false, true>(
+            vectorizea, vectorizeb, tc2, nullptr, ntt::fixed_shape_v<0, 1>,
+            ntt::fixed_shape_v<>, ntt::fixed_shape_v<1, 0>,
+            ntt::fixed_shape_v<>);
+
+        auto tc2devectorize = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 8>);
+        ntt::unpack(tc2, tc2devectorize, ntt::fixed_shape_v<0, 1>);
+
+        ntt::apply(tc.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tc2devectorize(index) == tc(index));
+        });
+    }
+}
+
+template <class T> struct act_1 {
+    constexpr auto operator()(const T &v) const noexcept { return v * 2.0f; }
+};
+
+template <class T> struct mul_scalar2 {
+  public:
+    constexpr auto operator()(const T &a) const { return ntt::mul(a, 2.3f); };
+};
+
+template <class T> struct mul_scalar {
+  public:
+    constexpr auto operator()(const T &a) const { return ntt::mul(a, 1.2f); };
+};
+
+NTT_HOST_DEVICE void test_unary_binary() {
+    // unary
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        std::fill(ta.elements().begin(), ta.elements().end(), 1.f);
+        ntt::unary<ntt::ops::erf>(ta, tb.view());
+        assert(are_floats_equal(tb(0, 0), erf(1.f)));
+    }
+
+    // scalar binary
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 16>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 16>);
+        std::fill(ta.elements().begin(), ta.elements().end(), 1.f);
+        std::fill(tb.elements().begin(), tb.elements().end(), 2.f);
+
+        ntt::binary<ntt::ops::add, mul_scalar>(ta, tb, tc.view());
+        assert(are_floats_equal(tc(0, 0), 3.0f * 1.2f));
+    }
+
+    // binary
+    {
+        auto ta = ntt::make_tensor<float>(ntt::make_shape(1_dim));
+        auto tb = ntt::make_tensor<float>(ntt::make_shape(1_dim));
+        auto tc = ntt::make_tensor<float>(ntt::make_shape(1_dim));
+        std::fill(ta.elements().begin(), ta.elements().end(), 1.f);
+        ntt::unary<ntt::ops::sin>(ta, tb.view());
+        assert(tb(0) == sinf(1.f));
+        ntt::binary<ntt::ops::mul>(ta, tb, tc);
+        assert(tc(0) == sinf(1.f));
+    }
+    // fixed
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        std::fill(ta.elements().begin(), ta.elements().end(), 1.f);
+        ntt::unary<ntt::ops::sin>(ta, tb.view());
+        assert(tb(0, 0) == sinf(1.f));
+        ntt::binary<ntt::ops::mul>(ta, tb, tc);
+        assert(tc(0, 0) == sinf(1.f));
+    }
+
+    // 2d binary
+    // pack and broadcast
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8>);
+        std::fill(ta.elements().begin(), ta.elements().end(), 1.f);
+        std::fill(tb.elements().begin(), tb.elements().end(), 1.f);
+        auto pa = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 4, 2>);
+        auto pc = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 4, 2>);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<1, 2>);
+        ntt::pack(tb, pb, ntt::fixed_shape_v<0>);
+        ntt::binary<ntt::ops::add>(pa, pb, pc.view());
+    }
+
+    // swish
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::unary<ntt::ops::swish>(ta, tb);
+
+        auto pa =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+        ntt::unary<ntt::ops::swish>(pa, pb);
+    }
+
+    // swishb
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<1>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        std::iota(tb.elements().begin(), tb.elements().end(), 1.f);
+        ntt::binary<ntt::ops::swishb>(ta, tb, tc);
+
+        auto pa =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+        ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
+        auto pc =
+            ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+        ntt::binary<ntt::ops::swishb>(pa, tb, pc);
+    }
+}
+
+NTT_HOST_DEVICE void test_tensor_view() {
+    // reshape
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 3>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 1, 3>);
+        ntt::tensor_copy_sync(ta.reshape(ntt::fixed_shape_v<2, 1, 3>),
+                              tb.view());
+        assert(ta(0, 0) == tb(0, 0, 0));
+        assert(ta(0, 1) == tb(0, 0, 1));
+        assert(ta(0, 2) == tb(0, 0, 2));
+        assert(ta(1, 0) == tb(1, 0, 0));
+        assert(ta(1, 1) == tb(1, 0, 1));
+        assert(ta(1, 2) == tb(1, 0, 2));
+    }
+
+    // nocontigious copy
+    {
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 6>);
+        std::iota(tc.elements().begin(), tc.elements().end(), 0.f);
+        auto td = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 3>);
+        ntt::tensor_copy_sync(
+            tc.view(ntt::make_shape(0, 3), ntt::fixed_shape_v<2, 3>), td);
+        ntt::apply(ntt::fixed_shape_v<2, 3>, [&](NNCASE_UNUSED auto index) {
+            assert(tc(index[0], index[1] + 3) == td(index));
+        });
+    }
+
+    // view & squeeze & unsqueeze.
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 2, 4>);
+        for (size_t i = 0; i < ta.shape()[1]; i++) {
+            for (size_t j = 0; j < ta.shape()[2]; j++) {
+                ta(0, i, j) = i * 4 + j;
+            }
+        }
+
+        auto tb = ta.view(ntt::make_shape(0, 0, 0), ntt::make_shape(1, 2, 1))
+                      .squeeze(ntt::fixed_shape_v<0, 2>);
+        assert(tb.strides()[0] == 4);
+        NNCASE_UNUSED auto tc = tb.unsqueeze(ntt::fixed_shape_v<0>);
+        assert(tc.shape()[0] == 1);
+        assert(tc.shape()[1] == 2);
+        assert(tc.strides()[0] == 0);
+        assert(tc.strides()[1] == 4);
+    }
+}
+
+NTT_HOST_DEVICE void test_vectorize() {
+    // fixed vectorize
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 64, 32>);
+        auto tb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 16, 32>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, tb.view(), fixed_shape_v<1>);
+        ntt::apply(tb.shape(), [&](auto index) {
+            ntt::dynamic_shape_t<tb.shape().rank()> inIndex;
+            ntt::loop<inIndex.rank()>([&](auto &i) { inIndex[i] = index[i]; });
+            NNCASE_UNUSED auto b = tb(index);
+            auto start = index[1_dim];
+            for (ntt::dim_t i = 0; i < 4; i++) {
+                index[1_dim] = start * 4 + i;
+                NNCASE_UNUSED auto va = ta(index);
+                NNCASE_UNUSED auto vb = b(i);
+                assert(vb == va);
+            }
+        });
+    }
+
+    // fixed pack 2
+    // {
+    //     auto dim_0 = 32_dim / 4;
+    //     auto seq_length = 28;
+    //     auto dim_1 = (seq_length + 63) / 64;
+    //     auto buffer_1 = ntt::make_tensor<float>(
+    //         ntt::make_shape(dim_0, dim_1), ntt::make_strides(2_dim, 1_dim));
+    //     for (size_t i = 0; i < 8_dim; i++) {
+    //         buffer_1(i, 0) = 3.0f;
+    //     }
+    //     auto buffer_2 = ntt::make_tensor<ntt::vector<float, 8>>(
+    //         ntt::make_shape(dim_0 / 8, dim_1), ntt::make_strides(0_dim,
+    //         1_dim));
+    //     ntt::pack(buffer_1, buffer_2, fixed_shape_v<0>);
+    //     for (size_t i = 0; i < 8_dim; i++) {
+    //         for (size_t j = 0; j < dim_1; j++) {
+    //             assert(buffer_2(0, j)(i) == buffer_1(i, j));
+    //         }
+    //     }
+    // }
+
+    // fixed pack with pad
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 3, 4>);
+        auto tb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 1, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, tb, fixed_shape_v<1>);
+        assert(tb(0, 0, 0)(0) == ta(0, 0, 0));
+        assert(tb(0, 0, 0)(1) == ta(0, 1, 0));
+        assert(tb(0, 0, 0)(2) == ta(0, 2, 0));
+        assert(are_floats_equal(tb(0, 0, 0)(3), 0.f));
+
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<16>);
+        auto td =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<4>);
+        std::iota(tc.elements().begin(), tc.elements().end(), 0.f);
+        ntt::pack(tc, td, fixed_shape_v<0>);
+        for (ntt::dim_t i = 0; i < 4; i++) {
+            assert(td(i)(0) == tc(i * 4 + 0));
+            assert(td(i)(1) == tc(i * 4 + 1));
+            assert(td(i)(2) == tc(i * 4 + 2));
+            assert(td(i)(3) == tc(i * 4 + 3));
+        }
+    }
+
+    // fixed vectorize with pad, and unary
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 3, 4>);
+        auto tb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 1, 4>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, tb, fixed_shape_v<1>);
+        auto tc = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 1, 4>);
+        ntt::unary<ntt::ops::cos>(tb, tc);
+        assert(tc(0, 0, 0)(0) == std::cos(ta(0, 0, 0)));
+        assert(tc(0, 0, 0)(1) == std::cos(ta(0, 1, 0)));
+        assert(tc(0, 0, 0)(2) == std::cos(ta(0, 2, 0)));
+        assert(are_floats_equal(tc(0, 0, 0)(3), std::cos(0.0f)));
+    }
+
+    // pack(fixed_shape + fixed_shape)
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 64, 32>);
+        auto tb = ntt::make_tensor<ntt::vector<float, 8>>(
+            ntt::fixed_shape_v<1, 8, 32>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, tb.view(), fixed_shape_v<1>);
+        ntt::apply(tb.shape(), [&](auto index) {
+            ntt::dynamic_shape_t<tb.shape().rank()> inIndex;
+            ntt::loop<inIndex.rank()>([&](auto &i) { inIndex[i] = index[i]; });
+            auto b = tb(index);
+            auto start = index[1_dim];
+            for (ntt::dim_t i = 0; i < 8; i++) {
+                index[1_dim] = start * 8 + i;
+                auto va = ta(index);
+                auto vb = b(i);
+                if (va != vb) {
+                    printf("va(%f) != vb(%f)\n", ntt::unwrap_proxy(va),
+                           ntt::unwrap_proxy(vb));
+                    assert(false);
+                }
+            }
+        });
+    }
+
+    {
+        // pack and broadcast
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<8>);
+        std::fill(ta.elements().begin(), ta.elements().end(), 1.f);
+        std::fill(tb.elements().begin(), tb.elements().end(), 1.f);
+        auto pa = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 4, 2>);
+        auto pc = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 4, 2>);
+        auto pb =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2>);
+        ntt::pack(ta, pa, fixed_shape_v<1, 2>);
+        ntt::pack(tb, pb, fixed_shape_v<0>);
+        ntt::binary<ntt::ops::add>(pa, pb, pc.view());
+    }
+
+    // unpack(fixed_shape + fixed_shape)
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 64, 32>);
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 64, 32>);
+        auto tb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 16, 32>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, tb.view(), fixed_shape_v<1>);
+        ntt::unpack(tb, tc.view(), fixed_shape_v<1>);
+        ntt::apply(tc.shape(), [&](auto index) {
+            NNCASE_UNUSED auto a = ta(index);
+            NNCASE_UNUSED auto c = tc(index);
+            assert(a == c);
+        });
+    }
+
+    // vector unary
+    {
+        ntt::vector<float, 8> v1(1.f);
+        NNCASE_UNUSED auto v2 = ntt::cos(v1);
+        assert(v2(0) == std::cos(1.f));
+    }
+}
+
+#if 0
+NTT_HOST_DEVICE void test_im2col() {
+    // im2col
+    {
+        auto input = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 1, 4, 4>);
+        std::iota(input.elements().begin(), input.elements().end(), 0.f);
+        auto output = ntt::make_tensor<float>(ntt::fixed_shape_v<9, 16>);
+        ntt::im2col(input, output, ntt::fixed_shape_v<3, 3>,
+                    ntt::fixed_shape_v<1, 1>,
+                    ntt::fixed_paddings_v<1, 1, 1, 1>);
+        // clang-format off
+      assert(output(0,0) == 0.f); assert(output(0,1) == 0.f); assert(output(0,2) == 0.f); assert(output(0,3) == 0.f); assert(output(0,4) == 0.f); assert(output(0,5) == 0.f); assert(output(0,6) == 1.f); assert(output(0,7) == 2.f); assert(output(0,8) == 0.f); assert(output(0,9) == 4.f); assert(output(0,10) == 5.f); assert(output(0,11) == 6.f); assert(output(0,12) == 0.f); assert(output(0,13) == 8.f); assert(output(0,14) == 9.f); assert(output(0,15) == 10.f);
+      assert(output(1,0) == 0.f); assert(output(1,1) == 0.f); assert(output(1,2) == 0.f); assert(output(1,3) == 0.f); assert(output(1,4) == 0.f); assert(output(1,5) == 1.f); assert(output(1,6) == 2.f); assert(output(1,7) == 3.f); assert(output(1,8) == 4.f); assert(output(1,9) == 5.f); assert(output(1,10) == 6.f); assert(output(1,11) == 7.f); assert(output(1,12) == 8.f); assert(output(1,13) == 9.f); assert(output(1,14) == 10.f); assert(output(1,15) == 11.f);
+      assert(output(2,0) == 0.f); assert(output(2,1) == 0.f); assert(output(2,2) == 0.f); assert(output(2,3) == 0.f); assert(output(2,4) == 1.f); assert(output(2,5) == 2.f); assert(output(2,6) == 3.f); assert(output(2,7) == 0.f); assert(output(2,8) == 5.f); assert(output(2,9) == 6.f); assert(output(2,10) == 7.f); assert(output(2,11) == 0.f); assert(output(2,12) == 9.f); assert(output(2,13) == 10.f); assert(output(2,14) == 11.f); assert(output(2,15) == 0.f);
+      assert(output(3,0) == 0.f); assert(output(3,1) == 0.f); assert(output(3,2) == 1.f); assert(output(3,3) == 2.f); assert(output(3,4) == 0.f); assert(output(3,5) == 4.f); assert(output(3,6) == 5.f); assert(output(3,7) == 6.f); assert(output(3,8) == 0.f); assert(output(3,9) == 8.f); assert(output(3,10) == 9.f); assert(output(3,11) == 10.f); assert(output(3,12) == 0.f); assert(output(3,13) == 12.f); assert(output(3,14) == 13.f); assert(output(3,15) == 14.f);
+      assert(output(4,0) == 0.f); assert(output(4,1) == 1.f); assert(output(4,2) == 2.f); assert(output(4,3) == 3.f); assert(output(4,4) == 4.f); assert(output(4,5) == 5.f); assert(output(4,6) == 6.f); assert(output(4,7) == 7.f); assert(output(4,8) == 8.f); assert(output(4,9) == 9.f); assert(output(4,10) == 10.f); assert(output(4,11) == 11.f); assert(output(4,12) == 12.f); assert(output(4,13) == 13.f); assert(output(4,14) == 14.f); assert(output(4,15) == 15.f);
+      assert(output(5,0) == 1.f); assert(output(5,1) == 2.f); assert(output(5,2) == 3.f); assert(output(5,3) == 0.f); assert(output(5,4) == 5.f); assert(output(5,5) == 6.f); assert(output(5,6) == 7.f); assert(output(5,7) == 0.f); assert(output(5,8) == 9.f); assert(output(5,9) == 10.f); assert(output(5,10) == 11.f); assert(output(5,11) == 0.f); assert(output(5,12) == 13.f); assert(output(5,13) == 14.f); assert(output(5,14) == 15.f); assert(output(5,15) == 0.f);
+      assert(output(6,0) == 0.f); assert(output(6,1) == 4.f); assert(output(6,2) == 5.f); assert(output(6,3) == 6.f); assert(output(6,4) == 0.f); assert(output(6,5) == 8.f); assert(output(6,6) == 9.f); assert(output(6,7) == 10.f); assert(output(6,8) == 0.f); assert(output(6,9) == 12.f); assert(output(6,10) == 13.f); assert(output(6,11) == 14.f); assert(output(6,12) == 0.f); assert(output(6,13) == 0.f); assert(output(6,14) == 0.f); assert(output(6,15) == 0.f);
+      assert(output(7,0) == 4.f); assert(output(7,1) == 5.f); assert(output(7,2) == 6.f); assert(output(7,3) == 7.f); assert(output(7,4) == 8.f); assert(output(7,5) == 9.f); assert(output(7,6) == 10.f); assert(output(7,7) == 11.f); assert(output(7,8) == 12.f); assert(output(7,9) == 13.f); assert(output(7,10) == 14.f); assert(output(7,11) == 15.f); assert(output(7,12) == 0.f); assert(output(7,13) == 0.f); assert(output(7,14) == 0.f); assert(output(7,15) == 0.f);
+      assert(output(8,0) == 5.f); assert(output(8,1) == 6.f); assert(output(8,2) == 7.f); assert(output(8,3) == 0.f); assert(output(8,4) == 9.f); assert(output(8,5) == 10.f); assert(output(8,6) == 11.f); assert(output(8,7) == 0.f); assert(output(8,8) == 13.f); assert(output(8,9) == 14.f); assert(output(8,10) == 15.f); assert(output(8,11) == 0.f); assert(output(8,12) == 0.f); assert(output(8,13) == 0.f); assert(output(8,14) == 0.f); assert(output(8,15) == 0.f);
+        // clang-format on
+    }
+
+    // im2col vectorized on ic
+    {
+        auto input = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 4, 4, 4>);
+        std::iota(input.elements().begin(), input.elements().end(), 0.f);
+        auto vectorized_input = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 1, 4, 4>);
+        ntt::pack(input, vectorized_input, ntt::fixed_shape_v<1>);
+        auto vectorized_output =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<9, 16>);
+        ntt::im2col(vectorized_input, vectorized_output,
+                    ntt::fixed_shape_v<3, 3>, ntt::fixed_shape_v<1, 1>,
+                    ntt::fixed_paddings_v<1, 1, 1, 1>, ntt::fixed_shape_v<1>,
+                    ntt::fixed_shape_v<0>);
+        auto devectorized_output =
+            ntt::make_tensor<float>(ntt::fixed_shape_v<36, 16>);
+        // vectorized [n,c/4,h,w,4] => [c/4 * h * w, b * oh * ow]
+        // so devectorize should after reshape
+        ntt::unpack(vectorized_output.reshape(ntt::fixed_shape_v<1, 9, 16>),
+                    devectorized_output.reshape(ntt::fixed_shape_v<4, 9, 16>),
+                    ntt::fixed_shape_v<0>);
+        auto output = ntt::make_tensor<float>(ntt::fixed_shape_v<36, 16>);
+        ntt::im2col(input, output, ntt::fixed_shape_v<3, 3>,
+                    ntt::fixed_shape_v<1, 1>,
+                    ntt::fixed_paddings_v<1, 1, 1, 1>);
+        ntt::apply(output.shape(), [&](auto index) {
+            NNCASE_UNUSED auto a = output(index);
+            NNCASE_UNUSED auto c = devectorized_output(index);
+            assert(a == c);
+        });
+    }
+}
+#endif
+
+NTT_HOST_DEVICE void test_concat() {
+    auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 8>);
+    auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 16>);
+    auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+    std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+    std::iota(tb.elements().begin(), tb.elements().end(), 0.f);
+    auto pa = ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 1>);
+    auto pb = ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 2>);
+    auto pc = ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+    ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
+    ntt::pack(tb, pb, ntt::fixed_shape_v<1>);
+    ntt::concat(ntt::make_tuple(pa, pb), pc, 1_dim);
+    ntt::unpack(pc, tc, ntt::fixed_shape_v<1>);
+
+    assert(tc(0, 0) == 0.f);
+    assert(tc(0, 1) == 1.f);
+    assert(tc(0, 2) == 2.f);
+    assert(tc(0, 3) == 3.f);
+    assert(tc(0, 4) == 4.f);
+    assert(tc(0, 5) == 5.f);
+    assert(tc(0, 6) == 6.f);
+    assert(tc(0, 7) == 7.f);
+    assert(tc(0, 8) == 0.f);
+    assert(tc(0, 9) == 1.f);
+    assert(tc(0, 10) == 2.f);
+    assert(tc(0, 11) == 3.f);
+    assert(tc(0, 12) == 4.f);
+    assert(tc(0, 13) == 5.f);
+    assert(tc(0, 14) == 6.f);
+    assert(tc(0, 15) == 7.f);
+    assert(tc(0, 16) == 8.f);
+    assert(tc(0, 17) == 9.f);
+    assert(tc(0, 18) == 10.f);
+    assert(tc(0, 19) == 11.f);
+    assert(tc(0, 20) == 12.f);
+    assert(tc(0, 21) == 13.f);
+    assert(tc(0, 22) == 14.f);
+    assert(tc(0, 23) == 15.f);
+}
+
+NTT_HOST_DEVICE void test_slice() {
+    auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+    auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 8>);
+    auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 16>);
+    std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+    ntt::slice(ta, tb, ntt::fixed_shape_v<0>, ntt::fixed_shape_v<8>,
+               ntt::fixed_shape_v<1>);
+    ntt::slice(ta, tc, ntt::fixed_shape_v<8>, fixed_shape_v<24>,
+               ntt::fixed_shape_v<1>);
+    assert(tb(0, 0) == 0.f);
+    assert(tb(0, 1) == 1.f);
+    assert(tb(0, 2) == 2.f);
+    assert(tb(0, 3) == 3.f);
+    assert(tb(0, 4) == 4.f);
+    assert(tb(0, 5) == 5.f);
+    assert(tb(0, 6) == 6.f);
+    assert(tb(0, 7) == 7.f);
+    assert(tc(0, 0) == 8.f);
+    assert(tc(0, 1) == 9.f);
+    assert(tc(0, 2) == 10.f);
+    assert(tc(0, 3) == 11.f);
+    assert(tc(0, 4) == 12.f);
+    assert(tc(0, 5) == 13.f);
+    assert(tc(0, 6) == 14.f);
+    assert(tc(0, 7) == 15.f);
+}
+
+NTT_HOST_DEVICE void test_transpose() {
+    auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<3, 24>);
+    auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<24, 3>);
+    std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+    ntt::transpose(ta, tb, ntt::fixed_shape_v<1, 0>);
+    assert(tb(0, 0) == 0.0f);
+    assert(tb(0, 1) == 24.f);
+    assert(tb(0, 2) == 48.f);
+
+    auto pa = ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+    auto pb = ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
+    ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
+    ntt::transpose(pa, pb.view(), ntt::fixed_shape_v<1, 0>);
+    assert(pb(0, 0)(0) == 0.0f);
+    assert(pb(0, 0)(1) == 1.0f);
+    assert(pb(0, 0)(2) == 2.0f);
+    assert(pb(0, 0)(3) == 3.0f);
+    assert(pb(0, 1)(0) == 24.f);
+    assert(pb(0, 1)(1) == 25.f);
+    assert(pb(0, 1)(2) == 26.f);
+    assert(pb(0, 1)(3) == 27.f);
+    assert(pb(0, 2)(0) == 48.f);
+    assert(pb(0, 2)(1) == 49.f);
+    assert(pb(0, 2)(2) == 50.f);
+    assert(pb(0, 2)(3) == 51.f);
+}
+
+NTT_HOST_DEVICE void test_gather() {
+    auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<6, 3>);
+    auto tb = ntt::make_tensor<int64_t>(ntt::fixed_shape_v<1, 3>);
+    auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 3, 3>);
+    std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+    std::iota(tb.elements().rbegin(), tb.elements().rend(), 0);
+    ntt::gather(ta, tb, tc, 0_dim);
+    assert(tc(0, 2, 0) == 0.0f);
+    assert(tc(0, 2, 1) == 1.0f);
+    assert(tc(0, 2, 2) == 2.0f);
+    assert(tc(0, 1, 0) == 3.0f);
+    assert(tc(0, 1, 1) == 4.0f);
+    assert(tc(0, 1, 2) == 5.0f);
+    assert(tc(0, 0, 0) == 6.0f);
+    assert(tc(0, 0, 1) == 7.0f);
+    assert(tc(0, 0, 2) == 8.0f);
+
+    auto td = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 3, 3>);
+    auto te = ntt::make_tensor<int64_t>(ntt::fixed_shape_v<1, 2>);
+    auto tf = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 1, 2, 3>);
+    std::iota(td.elements().begin(), td.elements().end(), 0.f);
+    std::iota(te.elements().rbegin(), te.elements().rend(), 0);
+    ntt::gather(td, te, tf, 1_dim);
+    assert(tf(0, 0, 1, 0) == 0.0f);
+    assert(tf(0, 0, 1, 1) == 1.0f);
+    assert(tf(0, 0, 1, 2) == 2.0f);
+    assert(tf(0, 0, 0, 0) == 3.0f);
+    assert(tf(0, 0, 0, 1) == 4.0f);
+    assert(tf(0, 0, 0, 2) == 5.0f);
+}
+
+NTT_HOST_DEVICE void test_pad() {
+    auto td = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 2, 3>);
+    auto te = ntt::make_tensor<float>(ntt::fixed_shape_v<8, 2, 3>);
+    std::iota(td.elements().begin(), td.elements().end(), 0.f);
+    ntt::pad(td, te, ntt::fixed_paddings_v<0, 7, 0, 0, 0, 0>, 1.3f);
+    assert(te(0, 0, 1) == 1.f);
+    assert(te(1, 0, 1) == 1.3f);
+    assert(te(2, 0, 1) == 1.3f);
+    assert(te(3, 0, 1) == 1.3f);
+}
+
+NTT_HOST_DEVICE void test_reduce() {
+    // vectorize 1d
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 16>);
+        auto tav =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<2, 4>);
+        std::fill(ta.elements().begin(), ta.elements().begin() + 16, 1.f);
+        std::fill(ta.elements().begin() + 16, ta.elements().end(), 3.2f);
+        ntt::pack(ta, tav.view(), ntt::fixed_shape_v<1>);
+
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 1>);
+        ntt::reduce_sum(tav, tb, ntt::fixed_shape_v<1>, ntt::fixed_shape_v<1>);
+        assert(are_floats_equal(tb(0, 0), 16.f));
+        assert(are_floats_equal(tb(1, 0), 51.2f));
+
+        // vectorize 1d and tiled.
+        auto tc = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 1>);
+        ntt::reduce_sum(
+            tav.view(ntt::make_shape(0, 0), ntt::fixed_shape_v<2, 2>), tc,
+            ntt::fixed_shape_v<1>, ntt::fixed_shape_v<1>);
+        ntt::reduce_sum<true>(
+            tav.view(ntt::make_shape(0, 2), ntt::fixed_shape_v<2, 2>), tc,
+            ntt::fixed_shape_v<1>, ntt::fixed_shape_v<1>);
+        assert(are_floats_equal(tb(0, 0), 16.f));
+        assert(are_floats_equal(tb(1, 0), 51.2f));
+    }
+
+    // vectorize 2d, inner reduce 0
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 32, 8>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 1, 8>);
+        auto upb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 1, 8>);
+        auto pa = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 8, 2>);
+        auto pb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 1, 2>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, pa.view(), ntt::fixed_shape_v<1, 2>);
+
+        ntt::reduce_sum(ta, tb, ntt::fixed_shape_v<1>);
+
+        ntt::reduce_sum(pa, pb, ntt::fixed_shape_v<1>,
+                        ntt::fixed_shape_v<1, 2>);
+
+        ntt::unpack(pb, upb.view(), ntt::fixed_shape_v<2>);
+        ntt::apply(tb.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tb(index) == upb(index));
+        });
+
+        // tiling on reduced axis
+        auto pc = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 1, 2>);
+        auto upc = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 1, 8>);
+        ntt::reduce_sum(
+            pa.view(ntt::make_shape(0, 0, 0), ntt::fixed_shape_v<1, 4, 2>), pc,
+            ntt::fixed_shape_v<1>, ntt::fixed_shape_v<1, 2>);
+        ntt::reduce_sum<true>(
+            pa.view(ntt::make_shape(0, 4, 0), ntt::fixed_shape_v<1, 4, 2>), pc,
+            ntt::fixed_shape_v<1>, ntt::fixed_shape_v<1, 2>);
+
+        ntt::unpack(pc, upc.view(), ntt::fixed_shape_v<2>);
+        ntt::apply(tb.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tb(index) == upc(index));
+        });
+    }
+
+    // vectorize 2d, inner reduce 1
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 8, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 8, 1>);
+        auto upb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 8, 1>);
+        auto upc = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 8, 1>);
+        auto pa = ntt::make_tensor<ntt::vector<float, 4, 4>>(
+            ntt::fixed_shape_v<1, 2, 4>);
+        auto pb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 2, 1>);
+        auto pc = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 2, 1>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, pa.view(), ntt::fixed_shape_v<1, 2>);
+
+        ntt::reduce_mean(ta, tb, ntt::fixed_shape_v<2>);
+        ntt::reduce_mean(pa, pb, ntt::fixed_shape_v<2>,
+                         ntt::fixed_shape_v<1, 2>);
+
+        ntt::unpack(pb, upb.view(), ntt::fixed_shape_v<1>);
+        ntt::apply(tb.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tb(index) == upb(index));
+        });
+
+        // tiling on reduced axis
+        ntt::reduce_max(ta, tb, ntt::fixed_shape_v<2>);
+        ntt::reduce_max(
+            pa.view(ntt::make_shape(0, 0, 0), ntt::fixed_shape_v<1, 2, 1>), pc,
+            ntt::fixed_shape_v<2>, ntt::fixed_shape_v<1, 2>);
+        ntt::reduce_max<true>(
+            pa.view(ntt::make_shape(0, 0, 1), ntt::fixed_shape_v<1, 2, 2>), pc,
+            ntt::fixed_shape_v<2>, ntt::fixed_shape_v<1, 2>);
+        ntt::reduce_max<true>(
+            pa.view(ntt::make_shape(0, 0, 3), ntt::fixed_shape_v<1, 2, 1>), pc,
+            ntt::fixed_shape_v<2>, ntt::fixed_shape_v<1, 2>);
+
+        ntt::unpack(pc, upc.view(), ntt::fixed_shape_v<1>);
+        ntt::apply(tb.shape(), [&]([[maybe_unused]] auto index) {
+            assert(tb(index) == upc(index));
+        });
+    }
+}
+
+NTT_HOST_DEVICE void test_cast() {
+    // normal cast
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        auto tb = ntt::make_tensor<int32_t>(ntt::fixed_shape_v<1, 16>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::cast(ta, tb.view(), ntt::fixed_shape_v<>);
+        assert(tb(0, 0) == 0);
+        assert(tb(0, 2) == 2);
+    }
+
+    // vectorized cast
+    {
+        auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 64, 32>);
+        auto tb = ntt::make_tensor<ntt::vector<float, 4>>(
+            ntt::fixed_shape_v<1, 16, 32>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+        ntt::pack(ta, tb.view(), ntt::fixed_shape_v<1>);
+        auto tc = ntt::make_tensor<ntt::vector<int32_t, 4>>(
+            ntt::fixed_shape_v<1, 16, 32>);
+        ntt::cast(tb, tc, ntt::fixed_shape_v<1>);
+        assert(tc(0, 0, 0)(0) == 0);
+        assert(tc(0, 0, 0)(1) == 32);
+        assert(tc(0, 0, 0)(2) == 64);
+    }
+
+    // cast with postops
+    {
+        auto ta = ntt::make_tensor<int32_t>(ntt::fixed_shape_v<1, 16>);
+        auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 16>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0);
+        ntt::cast<mul_scalar>(ta, tb.view(), ntt::fixed_shape_v<>);
+        assert(tb(0, 0) == 0.f);
+        assert(tb(0, 2) == (2.f * 1.2f));
+    }
+
+    // vectorize cast with postops
+    {
+        auto ta = ntt::make_tensor<int32_t>(ntt::fixed_shape_v<4, 8>);
+        auto tb =
+            ntt::make_tensor<ntt::vector<int32_t, 4>>(ntt::fixed_shape_v<1, 8>);
+        std::iota(ta.elements().begin(), ta.elements().end(), 0);
+        ntt::pack(ta, tb.view(), ntt::fixed_shape_v<0>);
+        auto tc =
+            ntt::make_tensor<ntt::vector<float, 4>>(ntt::fixed_shape_v<1, 8>);
+        ntt::cast<mul_scalar>(tb, tc, ntt::fixed_shape_v<0>);
+        assert(tc(0, 0)(0) == (0.f * 1.2f));
+        assert(tc(0, 0)(1) == (8.f * 1.2f));
+        assert(tc(0, 0)(2) == (16.f * 1.2f));
+        assert(tc(0, 0)(3) == (24.f * 1.2f));
+    }
+}
+
+NTT_HOST_DEVICE void test_expand() {
+    // [1, 3, 5, 7] strides = [3*5*7, 5*7, 7 , 1]
+    //          [1] strides = [1] -> shape [1, 3, 5, 7] strides = [0, 0, 0, 1]
+    // [1, 3, 5, 7] strides = [3*5*7, 5*7, 7 , 1]
+    //          [7] strides = [1] -> shape [1, 3, 5, 7] strides = [0, 0, 0, 1]
+    // [1, 3, 5, 7] strides = [3*5*7, 5*7, 7 , 1]
+    //       [5, 7] strides = [7, 1] -> shape [1, 3, 5, 7] strides = [0, 0, 7,
+    //       1]
+    // [1, 3, 5, 7] strides = [3*5*7, 5*7, 7 , 1]
+    //       [5, 1] strides = [1, 1] -> shape [1, 3, 5, 7] strides = [0, 0, 1,
+    //       1]
+    auto ta = ntt::make_tensor<float>(ntt::fixed_shape_v<1, 2>);
+    auto tb = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 2>);
+    std::iota(ta.elements().begin(), ta.elements().end(), 0.f);
+    ntt::expand(ta, tb.view());
+    assert(are_floats_equal(tb(0, 0), 0.f));
+    assert(are_floats_equal(tb(0, 1), 1.f));
+    assert(are_floats_equal(tb(1, 0), 0.f));
+    assert(are_floats_equal(tb(1, 1), 1.f));
+}
+
+NTT_HOST_DEVICE void test_where() {
+    auto tcond = ntt::make_tensor<bool>(ntt::fixed_shape_v<2, 2>);
+    auto tx = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 2>);
+    auto ty = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 2>);
+    auto tout = ntt::make_tensor<float>(ntt::fixed_shape_v<2, 2>);
+    tcond(0, 0) = true;
+    tcond(0, 1) = false;
+    tcond(1, 0) = false;
+    tcond(1, 1) = true;
+    std::iota(tx.elements().begin(), tx.elements().end(), 0.f);
+    std::iota(ty.elements().begin(), ty.elements().end(), 4.f);
+    ntt::where(tcond, tx, ty, tout.view());
+    assert(are_floats_equal(tout(0, 0), 0.f));
+    assert(are_floats_equal(tout(0, 1), 5.f));
+    assert(are_floats_equal(tout(1, 0), 6.f));
+    assert(are_floats_equal(tout(1, 1), 3.f));
+}
+
+#if 0
+void test_reduce_arg() {
+    ntt::tensor<float, ntt::fixed_shape<2, 4>> ta;
+    ta(0, 0) = 0.f;
+    ta(0, 1) = 2.f;
+    ta(0, 2) = 4.f;
+    ta(0, 3) = 6.f;
+    ta(1, 0) = 7.f;
+    ta(1, 1) = 5.f;
+    ta(1, 2) = 3.f;
+    ta(1, 3) = 7.f;
+
+    ntt::tensor<int64_t, ntt::fixed_shape<2, 1>> tb;
+    ntt::reduce_arg<ntt::ops::max, 1, false, true>(
+        ta, tb.view(), ntt::fixed_shape<>(), ntt::fixed_shape<>());
+    assert(tb(0, 0) == 3);
+    assert(tb(1, 0) == 0);
+
+    ntt::tensor<int64_t, ntt::fixed_shape<1, 4>> tc;
+    ntt::reduce_arg<ntt::ops::max, 0, false, true>(
+        ta, tc.view(), ntt::fixed_shape<>(), ntt::fixed_shape<>());
+    assert(tc(0, 0) == 1);
+    assert(tc(0, 1) == 1);
+    assert(tc(0, 2) == 0);
+    assert(tc(0, 3) == 1);
+
+    ntt::tensor<int64_t, ntt::fixed_shape<2>> td;
+    ntt::reduce_arg<ntt::ops::max, 1, true, false>(
+        ta, td.view(), ntt::fixed_shape<>(), ntt::fixed_shape<>());
+    assert(td(0) == 3);
+    assert(td(1) == 3);
+}
+#endif
+
+__global__ void test_device() {
+    printf("Start Device tests...\n");
+    test_shape();
+    test_strides();
+    test_matmul_normal();
+    test_matmul_transpose_b();
+    test_unary_binary();
+    test_tensor_view();
+    test_vectorize();
+    // test_im2col();
+    test_concat();
+    test_slice();
+    test_transpose();
+    test_gather();
+    test_pad();
+    test_reduce();
+    test_cast();
+    test_expand();
+    test_where();
+    printf("All Device tests passed!\n");
+}
+
+int main() {
+#if 0
+    nncase_clr_initialize(
+        R"(E:\Work\Repos\nncase-v2\nncase\src\Nncase.Compiler\bin\Debug\net6.0\Nncase.Compiler.dll)");
+    auto target_name = "cpu"sv;
+    auto nncapi = nncase_clr_api();
+    clr_object_ptr target, compile_session, compiler, compile_options;
+    compile_options = nncapi->compile_options_create();
+    target = nncapi->target_create(target_name.data(), target_name.length());
+    nncapi->compile_session_create(target.get(), compile_options.get());
+    compiler = nncapi->compile_session_get_compiler(compile_session.get());
+#endif
+    printf("Start Host tests...\n");
+    test_shape();
+    test_strides();
+    test_matmul_normal();
+    test_matmul_transpose_b();
+    test_unary_binary();
+    test_tensor_view();
+    test_vectorize();
+    // test_im2col();
+    test_concat();
+    test_slice();
+    test_transpose();
+    test_gather();
+    test_pad();
+    test_reduce();
+    test_cast();
+    test_expand();
+    test_where();
+#if 0
+    test_reduce_arg();
+#endif
+    printf("All Host tests passed!\n");
+
+    test_device<<<1, 1>>>();
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+#if 0
+    auto kmodel = read_file(
+        R"(/mnt/home-nas/work/repo/nncase/tests_output/UnitTestCPUTarget/TestSimpleUnary/TestSimpleUnary.kmodel)");
+
+    interpreter *interp;
+    TRY(nncase_interp_create(&interp));
+    TRY(nncase_interp_load_model(interp, kmodel.data(), kmodel.size(), false));
+
+    runtime_function *entry;
+    TRY(nncase_interp_get_entry_func(interp, &entry));
+
+    buffer_allocator *host_alloc;
+    TRY(nncase_buffer_allocator_get_host(&host_alloc));
+
+    datatype_node *dtype_int64, *dtype_float32;
+    TRY(nncase_dtype_create_prime(dt_int64, &dtype_int64));
+    TRY(nncase_dtype_create_prime(dt_float32, &dtype_float32));
+
+    float x[] = {-1.f};
+    buffer_node *x_buf;
+    TRY(nncase_buffer_allocator_alloc(host_alloc, sizeof(x), nullptr, &x_buf));
+    {
+        host_buffer_node *x_host_buf;
+        void *x_buf_data;
+        TRY(nncase_buffer_as_host(x_buf, &x_host_buf));
+        TRY(nncase_host_buffer_map(x_host_buf, map_write, &x_buf_data,
+                                   nullptr));
+        memcpy(x_buf_data, x, sizeof(x));
+        TRY(nncase_host_buffer_unmap(x_host_buf));
+        TRY(nncase_object_release((object_node *)x_host_buf));
+    }
+
+    tensor_node *x_tensor;
+    uint32_t dims[] = {1, 1};
+    uint32_t strides[] = {1, 1};
+    nncase_buffer_slice x_buffer_slice{x_buf, 0, sizeof(x)};
+    TRY(nncase_tensor_create(dtype_float32, dims, 1, strides, 1,
+                             &x_buffer_slice, &x_tensor));
+
+    value_node *params[] = {(value_node *)x_tensor};
+    tensor_node *ret = nullptr;
+
+    auto time_begin = std::chrono::steady_clock::now();
+
+    TRY(nncase_func_invoke(entry, params, 1, (value_node **)&ret));
+
+    auto time_end = std::chrono::steady_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        time_end - time_begin);
+    printf("Duration: %.2fms\n", duration.count() / 1e3);
+
+    uint32_t ret_dims_len;
+    TRY(nncase_tensor_get_dims(ret, nullptr, &ret_dims_len));
+    std::vector<uint32_t> ret_dims(ret_dims_len);
+    TRY(nncase_tensor_get_dims(ret, ret_dims.data(), &ret_dims_len));
+
+    nncase_buffer_slice out_buffer_slice;
+    TRY(nncase_tensor_get_buffer(ret, &out_buffer_slice));
+    {
+        host_buffer_node *ret_host_buf;
+        void *ret_buf_data;
+        uint32_t ret_bytes;
+        TRY(nncase_buffer_as_host(out_buffer_slice.buffer, &ret_host_buf));
+        TRY(nncase_host_buffer_map(ret_host_buf, map_read, &ret_buf_data,
+                                   &ret_bytes));
+
+        auto ret_float_data = (float *)ret_buf_data;
+        std::cout << *ret_float_data << std::endl;
+
+        TRY(nncase_host_buffer_unmap(ret_host_buf));
+        TRY(nncase_object_release((object_node *)ret_host_buf));
+    }
+
+    TRY(nncase_object_release((object_node *)out_buffer_slice.buffer));
+    TRY(nncase_object_release((object_node *)ret));
+    TRY(nncase_object_release((object_node *)x_buf));
+    TRY(nncase_object_release((object_node *)x_tensor));
+    TRY(nncase_object_release((object_node *)dtype_int64));
+    TRY(nncase_interp_free(interp));
+#endif
+    return 0;
+}
diff --git a/src/Native/src/runtime/CMakeLists.txt b/src/Native/src/runtime/CMakeLists.txt
index 8fa38c51b..f98ed4c40 100644
--- a/src/Native/src/runtime/CMakeLists.txt
+++ b/src/Native/src/runtime/CMakeLists.txt
@@ -17,7 +17,7 @@ set(SRCS buffer.cpp
 		 type_serializer.cpp
 		 tensor_serializer.cpp
 		 runtime_tensor.cpp
-         dump_manager.cpp)
+     dump_manager.cpp)
 
 if ((NOT BUILDING_RUNTIME) OR DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL)
     list(APPEND SRCS host_allocator.cpp)
@@ -33,11 +33,14 @@ if (BUILDING_RUNTIME)
     if (ENABLE_DUMP_MANAGER)
       target_compile_definitions(runtime PUBLIC -DNNCASE_DUMP_MANAGER)
     endif()
+    if (ENABLE_CUDA_RUNTIME)
+      target_compile_definitions(runtime PRIVATE -DNNCASE_ENABLE_CUDA_RUNTIME)
+    endif()
     set_property(TARGET runtime PROPERTY POSITION_INDEPENDENT_CODE ON)
     install(TARGETS runtime EXPORT nncaseruntimeTargets)
 
     add_library(nncaseruntime STATIC dummy.cpp)
-    target_link_libraries(nncaseruntime PUBLIC nlohmann_json::nlohmann_json PRIVATE nncasebase runtime runtime_cpu cpu_loaders)
+    target_link_libraries(nncaseruntime PUBLIC nlohmann_json::nlohmann_json PRIVATE nncasebase runtime)
     set_target_properties(nncaseruntime PROPERTIES
                                         OUTPUT_NAME "Nncase.Runtime.Native")
     install(TARGETS nncaseruntime EXPORT nncaseruntimeTargets
@@ -63,10 +66,13 @@ else()
     if (ENABLE_DUMP_MANAGER)
       target_compile_definitions(simulator PUBLIC -DNNCASE_DUMP_MANAGER)
     endif()
+    if (ENABLE_CUDA_RUNTIME)
+      target_compile_definitions(simulator PRIVATE -DNNCASE_ENABLE_CUDA_RUNTIME)
+    endif()
     set_property(TARGET simulator PROPERTY POSITION_INDEPENDENT_CODE ON)
     
     add_library(nncaseruntime SHARED dummy.cpp)
-    target_link_libraries(nncaseruntime PUBLIC nlohmann_json::nlohmann_json PRIVATE nncasebase simulator compiler simulator_cpu cpu_loaders fmt::fmt)
+    target_link_libraries(nncaseruntime PUBLIC nlohmann_json::nlohmann_json PRIVATE nncasebase simulator compiler fmt::fmt)
     target_compile_definitions(nncaseruntime PUBLIC -DNNCASE_SHARED_LIBS)
     if (NOT (WIN32 OR APPLE))
       target_link_libraries(nncaseruntime PRIVATE dl pthread)
@@ -89,3 +95,6 @@ else()
 endif()
 
 add_subdirectory(cpu)
+if (ENABLE_CUDA_RUNTIME)
+    add_subdirectory(cuda)
+endif()
diff --git a/src/Native/src/runtime/cpu/CMakeLists.txt b/src/Native/src/runtime/cpu/CMakeLists.txt
index e356fce51..f32b775b9 100644
--- a/src/Native/src/runtime/cpu/CMakeLists.txt
+++ b/src/Native/src/runtime/cpu/CMakeLists.txt
@@ -11,8 +11,10 @@ if (BUILDING_RUNTIME)
     target_link_libraries(runtime_cpu PUBLIC runtime cpu_loaders)
     set_property(TARGET runtime_cpu PROPERTY POSITION_INDEPENDENT_CODE ON)
     install(TARGETS runtime_cpu EXPORT nncaseruntimeTargets)
+    target_link_libraries(nncaseruntime PUBLIC runtime_cpu)
 else()
     add_library(simulator_cpu OBJECT ${SRCS})
     target_link_libraries(simulator_cpu PUBLIC simulator cpu_loaders)
     set_property(TARGET simulator_cpu PROPERTY POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(nncaseruntime PRIVATE simulator_cpu cpu_loaders)
 endif()
diff --git a/src/Native/src/runtime/cpu/runtime_function.cpp b/src/Native/src/runtime/cpu/runtime_function.cpp
index 95295c8d4..e867598e1 100644
--- a/src/Native/src/runtime/cpu/runtime_function.cpp
+++ b/src/Native/src/runtime/cpu/runtime_function.cpp
@@ -45,6 +45,7 @@ typedef struct {
     uint32_t local_data_align;
     uint64_t output_pool_size;
     uint64_t local_data_pool_size;
+    uint64_t warp_local_data_pool_size;
     uint64_t block_local_data_pool_size;
 } kernel_desc_header;
 
@@ -59,8 +60,9 @@ cpu_runtime_module &cpu_runtime_function::module() const noexcept {
 
 result<void> cpu_runtime_function::initialize_core(
     runtime_function_init_context &context) noexcept {
+    const auto blocks_count = module().cdim() * module().bdim();
     try_(context.read_section(
-        ".desc", [this](auto reader, size_t) -> result<void> {
+        ".desc", [this, blocks_count](auto reader, size_t) -> result<void> {
             auto header = reader.template read<kernel_desc_header>();
 
             // Allocate output buffer
@@ -74,7 +76,6 @@ result<void> cpu_runtime_function::initialize_core(
 
             // Allocate thread local datas
             options.alignment = header.local_data_align;
-            auto blocks_count = module().cdim() * module().bdim();
             thread_local_datas_.resize(blocks_count);
             for (size_t i = 0; i < blocks_count; i++) {
                 try_var(buffer,
@@ -122,6 +123,21 @@ result<void> cpu_runtime_function::initialize_core(
         };
     }
 
+    // Allocate profiling records
+    if (module()
+            .interp()
+            .options()
+            .get_scalar_opt<uint8_t>("enable_profiling")
+            .or_(false)) {
+        profile_records_.resize(blocks_count);
+        profile_record_counts_.resize(blocks_count);
+        for (size_t i = 0; i < blocks_count; i++) {
+            profile_records_[i].resize(module().tdim() *
+                                       default_profile_record_count);
+            profile_record_counts_[i].resize(module().tdim());
+        }
+    }
+
     return ok();
 }
 
diff --git a/src/Native/src/runtime/cpu/runtime_function.h b/src/Native/src/runtime/cpu/runtime_function.h
index e5e615bff..fea1c3fd1 100644
--- a/src/Native/src/runtime/cpu/runtime_function.h
+++ b/src/Native/src/runtime/cpu/runtime_function.h
@@ -13,6 +13,7 @@
  * limitations under the License.
  */
 #pragma once
+#include "nncase/ntt/profiling.h"
 #include "runtime_module.h"
 #include <nncase/ntt/arch/cpu/runtime.h>
 #include <nncase/runtime/host_buffer.h>
@@ -24,6 +25,8 @@ BEGIN_NS_NNCASE_RT_MODULE(cpu)
 
 class cpu_runtime_function final : public runtime_function {
   public:
+    static constexpr size_t default_profile_record_count = 10000;
+
     cpu_runtime_function(runtime_module &rt_module);
     virtual ~cpu_runtime_function();
 
@@ -45,6 +48,16 @@ class cpu_runtime_function final : public runtime_function {
         return mapped_local_data.buffer();
     }
 
+    const std::span<ntt::runtime::profile_record>
+    thread_local_profile_records(size_t block_id) noexcept {
+        return profile_records_[block_id];
+    }
+
+    const std::span<uint32_t>
+    thread_local_profile_record_counts(size_t block_id) noexcept {
+        return profile_record_counts_[block_id];
+    }
+
   protected:
     result<void>
     initialize_core(runtime_function_init_context &context) noexcept override;
@@ -66,6 +79,9 @@ class cpu_runtime_function final : public runtime_function {
     std::vector<ntt::runtime::thread_inout_desc> output_descs_;
     std::vector<dims_t> output_shapes_;
     std::vector<dims_t> output_strides_;
+
+    std::vector<std::vector<ntt::runtime::profile_record>> profile_records_;
+    std::vector<std::vector<uint32_t>> profile_record_counts_;
 };
 
 END_NS_NNCASE_RT_MODULE
diff --git a/src/Native/src/runtime/cpu/runtime_function.run.cpp b/src/Native/src/runtime/cpu/runtime_function.run.cpp
index ff3246ef4..2721013fa 100644
--- a/src/Native/src/runtime/cpu/runtime_function.run.cpp
+++ b/src/Native/src/runtime/cpu/runtime_function.run.cpp
@@ -13,7 +13,6 @@
  * limitations under the License.
  */
 #include "runtime_function.h"
-#include <nncase/ntt/arch/cpu/profiling.h>
 #include <nncase/ntt/arch/cpu/runtime.h>
 #include <nncase/runtime/dbg.h>
 #include <nncase/runtime/interpreter.h>
@@ -29,7 +28,6 @@ using namespace nncase::ntt::runtime;
 
 result<void> cpu_runtime_function::run(std::byte *output_data) noexcept {
     std::vector<std::thread> blocks;
-    timer_record timer_records[24];
     try_var(enable_profiling,
             module().interp().options().get_scalar_opt<uint8_t>(
                 "enable_profiling"));
@@ -45,7 +43,7 @@ result<void> cpu_runtime_function::run(std::byte *output_data) noexcept {
                 module().block_local_rdata_content().subspan(
                     block_local_rdata_offset, block_local_rdata_size);
             blocks.emplace_back([cid, bid, linear_bid, tid_offset,
-                                 enable_profiling, timer_records, output_data,
+                                 enable_profiling, output_data,
                                  block_local_rdata, this] {
                 cpu_block_entry_params_t block_entry_params{
                     .tdim = module().tdim(),
@@ -54,14 +52,11 @@ result<void> cpu_runtime_function::run(std::byte *output_data) noexcept {
                     .bid = bid,
                     .cid = cid,
                     .cpu_id_offset = tid_offset,
+                    .enable_profiling = enable_profiling,
                     .input_descs = this->input_descs_.data(),
                     .output_descs = this->output_descs_.data(),
                     .rdata = module().rdata(),
                     .output = output_data,
-                    .enable_profiling = enable_profiling,
-                    .timer_records = const_cast<timer_record *>(
-                        &timer_records[cid * module().bdim() * module().tdim() +
-                                       bid * module().tdim()]),
                     .thread_local_rdata_header =
                         module().thread_local_rdata_header(tid_offset),
                     .thread_local_cache_header =
@@ -73,6 +68,15 @@ result<void> cpu_runtime_function::run(std::byte *output_data) noexcept {
                     .block_local_rdata = block_local_rdata,
                     .thread_local_data = thread_local_data(linear_bid),
                     .block_local_data = block_local_data(linear_bid),
+                    .profile_records =
+                        enable_profiling
+                            ? thread_local_profile_records(linear_bid)
+                            : std::span<ntt::runtime::profile_record>{},
+                    .profile_record_counts =
+                        enable_profiling
+                            ? thread_local_profile_record_counts(linear_bid)
+                                  .data()
+                            : nullptr,
 #ifdef __APPLE__
                     .cpu_thread_context_key = module().cpu_thread_context_key(),
 #endif
diff --git a/src/Native/src/runtime/cpu/runtime_module.cpp b/src/Native/src/runtime/cpu/runtime_module.cpp
index 434b165c3..cc5610f13 100644
--- a/src/Native/src/runtime/cpu/runtime_module.cpp
+++ b/src/Native/src/runtime/cpu/runtime_module.cpp
@@ -28,9 +28,9 @@ using namespace nncase::ntt::runtime;
 
 typedef struct {
     uint32_t tdim;
+    uint32_t wdim;
     uint32_t bdim;
     uint32_t cdim;
-    uint32_t reserved0;
 } module_desc_header;
 
 cpu_runtime_module::cpu_runtime_module() noexcept
diff --git a/src/Native/src/runtime/cuda/CMakeLists.txt b/src/Native/src/runtime/cuda/CMakeLists.txt
new file mode 100644
index 000000000..4fbe145f7
--- /dev/null
+++ b/src/Native/src/runtime/cuda/CMakeLists.txt
@@ -0,0 +1,22 @@
+﻿cmake_minimum_required (VERSION 3.13)
+
+find_package(CUDAToolkit REQUIRED)
+
+add_subdirectory(loaders)
+
+set(SRCS runtime_module.cpp
+         runtime_function.cpp
+         runtime_function.run.cpp)
+
+if (BUILDING_RUNTIME)
+    add_library(runtime_cuda OBJECT ${SRCS})
+    set_property(TARGET runtime_cuda PROPERTY POSITION_INDEPENDENT_CODE ON)
+    install(TARGETS runtime_cuda EXPORT nncaseruntimeTargets)
+    target_link_libraries(runtime_cuda PUBLIC CUDA::cudart)
+    target_link_libraries(nncaseruntime PUBLIC runtime_cuda cuda_loaders)
+else()
+    add_library(simulator_cuda OBJECT ${SRCS})
+    target_link_libraries(simulator_cuda PUBLIC simulator cuda_loaders)
+    set_property(TARGET simulator_cuda PROPERTY POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(nncaseruntime PRIVATE simulator_cuda cuda_loaders)
+endif()
diff --git a/src/Native/src/runtime/cuda/loaders/CMakeLists.txt b/src/Native/src/runtime/cuda/loaders/CMakeLists.txt
new file mode 100644
index 000000000..a66a2b4a5
--- /dev/null
+++ b/src/Native/src/runtime/cuda/loaders/CMakeLists.txt
@@ -0,0 +1,11 @@
+﻿cmake_minimum_required (VERSION 3.13)
+
+set(SRCS cuda_loader.cpp)
+
+add_library(cuda_loaders OBJECT ${SRCS})
+set_property(TARGET cuda_loaders PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(cuda_loaders PUBLIC CUDA::cudart)
+
+if (BUILDING_RUNTIME)
+    install(TARGETS cuda_loaders EXPORT nncaseruntimeTargets)
+endif()
diff --git a/src/Native/src/runtime/cuda/loaders/cuda_loader.cpp b/src/Native/src/runtime/cuda/loaders/cuda_loader.cpp
new file mode 100644
index 000000000..b17a473ee
--- /dev/null
+++ b/src/Native/src/runtime/cuda/loaders/cuda_loader.cpp
@@ -0,0 +1,44 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "cuda_loader.h"
+#include <cuda_runtime_api.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <nncase/runtime/result.h>
+#include <nncase/runtime/span_reader.h>
+
+using namespace nncase::runtime;
+
+#define THROW_CUDA_IF_FAILED(x)                                                \
+    if ((x) != cudaSuccess) {                                                  \
+        throw std::runtime_error(cudaGetErrorString(x));                       \
+    }
+
+cuda_loader::~cuda_loader() {
+    if (mod_) {
+        cudaLibraryUnload(mod_);
+    }
+}
+
+void cuda_loader::load(std::span<const std::byte> fatbin) {
+    THROW_CUDA_IF_FAILED(cudaLibraryLoadData(&mod_, fatbin.data(), nullptr,
+                                             nullptr, 0, nullptr, nullptr, 0));
+    THROW_CUDA_IF_FAILED(cudaLibraryGetKernel(&sym_, mod_, "block_entry"));
+}
+void cuda_loader::load_from_file(std::string_view path) {
+    THROW_CUDA_IF_FAILED(cudaLibraryLoadFromFile(
+        &mod_, path.data(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+    THROW_CUDA_IF_FAILED(cudaLibraryGetKernel(&sym_, mod_, "block_entry"));
+}
diff --git a/src/Native/src/runtime/cuda/loaders/cuda_loader.h b/src/Native/src/runtime/cuda/loaders/cuda_loader.h
new file mode 100644
index 000000000..efd14986e
--- /dev/null
+++ b/src/Native/src/runtime/cuda/loaders/cuda_loader.h
@@ -0,0 +1,46 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <cstdint>
+#include <cuda_runtime_api.h>
+#include <nncase/compiler_defs.h>
+#include <span>
+#include <string_view>
+
+BEGIN_NS_NNCASE_RUNTIME
+
+class cuda_loader {
+  public:
+    cuda_loader() noexcept
+        :
+#if 0
+     ofi_(nullptr),
+#endif
+          mod_(nullptr),
+          sym_(nullptr) {
+    }
+    ~cuda_loader();
+
+    void load(std::span<const std::byte> fatbin);
+    void load_from_file(std::string_view path);
+    uintptr_t handle() const noexcept { return (uintptr_t)mod_; }
+    cudaKernel_t entry() const noexcept { return sym_; }
+
+  private:
+    cudaLibrary_t mod_;
+    cudaKernel_t sym_;
+};
+
+END_NS_NNCASE_RUNTIME
diff --git a/src/Native/src/runtime/cuda/runtime_function.cpp b/src/Native/src/runtime/cuda/runtime_function.cpp
new file mode 100644
index 000000000..9b6bca168
--- /dev/null
+++ b/src/Native/src/runtime/cuda/runtime_function.cpp
@@ -0,0 +1,327 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "runtime_function.h"
+#include "nncase/runtime/buffer.h"
+#include <cstdint>
+#include <nncase/llm/paged_attention_kv_cache.h>
+#include <nncase/ntt/arch/cuda/runtime.h>
+#include <nncase/runtime/allocator.h>
+#include <nncase/runtime/dbg.h>
+#include <nncase/runtime/interpreter.h>
+#include <nncase/runtime/runtime_op_utility.h>
+#include <nncase/runtime/util.h>
+#include <nncase/type.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::cuda;
+using namespace nncase::ntt::runtime;
+
+typedef struct {
+    uint32_t output_align;
+    uint32_t local_data_align;
+    uint64_t output_pool_size;
+    uint64_t thread_local_data_pool_size;
+    uint64_t warp_local_data_pool_size;
+    uint64_t block_local_data_pool_size;
+} kernel_desc_header;
+
+cuda_runtime_function::cuda_runtime_function(runtime_module &rt_module)
+    : runtime_function(rt_module), block_entry_(nullptr) {}
+
+cuda_runtime_function::~cuda_runtime_function() {}
+
+cuda_runtime_module &cuda_runtime_function::module() const noexcept {
+    return static_cast<cuda_runtime_module &>(runtime_function::module());
+}
+
+result<void> cuda_runtime_function::initialize_core(
+    runtime_function_init_context &context) noexcept {
+    try_(context.read_section(
+        ".desc", [this](auto reader, size_t) -> result<void> {
+            auto header = reader.template read<kernel_desc_header>();
+
+            // Allocate output buffer
+            buffer_allocate_options options{};
+            options.flags = HOST_BUFFER_ALLOCATE_CPU_ONLY;
+            options.alignment = header.output_align;
+            try_var(output_buffer, buffer_allocator::host().allocate(
+                                       header.output_pool_size, options));
+            try_set(this->output_buffer_,
+                    output_buffer.template as<host_buffer_t>());
+
+            const size_t thread_local_data_size =
+                header.thread_local_data_pool_size * module().tdim() *
+                module().wdim() * module().bdim();
+            const size_t warp_local_data_size =
+                header.warp_local_data_pool_size * module().wdim() *
+                module().bdim();
+            const size_t block_local_data_size =
+                header.block_local_data_pool_size * module().bdim();
+            for (size_t cid = 0; cid < module().cdim(); cid++) {
+                CHECK_CUDA(cudaSetDevice(cid));
+
+                // Allocate thread local datas
+                std::byte *thread_local_data_dev_ptr;
+                CHECK_CUDA(cudaMalloc((void **)&thread_local_data_dev_ptr,
+                                      thread_local_data_size));
+                thread_local_datas_.emplace_back(thread_local_data_dev_ptr,
+                                                 thread_local_data_size);
+
+                // Allocate warp local datas
+                std::byte *warp_local_data_dev_ptr;
+                CHECK_CUDA(cudaMalloc((void **)&warp_local_data_dev_ptr,
+                                      warp_local_data_size));
+                warp_local_datas_.emplace_back(warp_local_data_dev_ptr,
+                                               warp_local_data_size);
+
+                // Allocate block local datas
+                std::byte *block_local_data_dev_ptr;
+                CHECK_CUDA(cudaMalloc((void **)&block_local_data_dev_ptr,
+                                      block_local_data_size));
+                block_local_datas_.emplace_back(block_local_data_dev_ptr,
+                                                block_local_data_size);
+            }
+            return ok();
+        }));
+    try_set(block_entry_, module().block_entry());
+
+    // Allocate input descs
+    auto input_size = parameters_size();
+    input_descs_.resize(input_size);
+
+    // Allocate output descs
+    auto output_size = return_size();
+    output_descs_.resize(output_size);
+    output_shapes_.resize(output_size);
+    output_strides_.resize(output_size);
+    for (size_t i = 0; i < output_size; i++) {
+        try_var(type, return_type(i));
+        try_var(ttype, type.as<tensor_type>());
+        auto rank = ttype->shape().rank();
+        CHECK_WITH_ERR(rank.has_value(), std::errc::invalid_argument);
+        output_shapes_[i].resize(*rank);
+        output_strides_[i].resize(*rank);
+        output_descs_[i] = thread_inout_desc{
+            .data = nullptr,
+            .size = 0,
+            .shape = output_shapes_[i].data(),
+            .strides = output_strides_[i].data(),
+        };
+    }
+
+    // Allocate profiling records
+    if (module()
+            .interp()
+            .options()
+            .get_scalar_opt<uint8_t>("enable_profiling")
+            .or_(false)) {
+        // profile_records_.resize(blocks_count);
+        // profile_record_counts_.resize(blocks_count);
+        // for (size_t i = 0; i < blocks_count; i++) {
+        //     profile_records_[i].resize(module().tdim() *
+        //                                default_profile_record_count);
+        //     profile_record_counts_[i].resize(module().tdim());
+        // }
+    }
+
+    return ok();
+}
+
+result<value_t> cuda_runtime_function::invoke_core(
+    std::span<value_t> parameters,
+    [[maybe_unused]] value_t return_value) noexcept {
+    size_t input_id = 0;
+    std::vector<thread_paged_attention_kv_cache_desc *> inout_paged_kvcaches;
+    for (auto arg : parameters) {
+        try_var(t, arg.as<tensor>());
+        try_var(hb, t->buffer().as_host());
+        try_var(m, hb.map(map_read_write));
+
+        if (t->dtype().is_a<reference_type_t>()) {
+            auto rt = t->dtype().as<reference_type_t>().expect(
+                "now only support reference value type!");
+            auto vt = rt->elemtype().as<value_type_t>().expect(
+                "now only support reference value type!");
+            if (vt->uuid() == datatype_t::paged_attention_kv_cache->uuid()) {
+                auto refspan =
+                    as_span<llm::paged_attention_kv_cache_node *>(m.buffer());
+                thread_paged_attention_kv_cache_desc *descs =
+                    new thread_paged_attention_kv_cache_desc[refspan.size()];
+                for (size_t i = 0; i < refspan.size(); i++) {
+                    auto &node = refspan[i];
+                    auto &desc = descs[i];
+                    {
+                        desc.num_seqs = node->num_seqs();
+                        desc.num_tokens = node->num_tokens();
+                        {
+                            try_var(hbf,
+                                    node->context_lens()->buffer().as_host());
+                            try_var(mbf, hbf.map(map_read));
+                            desc.context_lens = (int64_t *)mbf.buffer().data();
+                            desc.context_lens_size =
+                                mbf.buffer().size_bytes() / sizeof(int64_t);
+                        }
+                        {
+                            try_var(hbf, node->seq_lens()->buffer().as_host());
+                            try_var(mbf, hbf.map(map_read));
+                            desc.seq_lens = (int64_t *)mbf.buffer().data();
+                            desc.seq_lens_size =
+                                mbf.buffer().size_bytes() / sizeof(int64_t);
+                        }
+
+                        // Paged attention specific parameters
+                        {
+                            try_var(hbf,
+                                    node->block_tables()->buffer().as_host());
+                            try_var(mbf, hbf.map(map_read));
+                            desc.block_table = (int64_t *)mbf.buffer().data();
+                            desc.block_table_shape[0] =
+                                node->block_tables()->shape()[0];
+                            desc.block_table_shape[1] =
+                                node->block_tables()->shape()[1];
+                            desc.block_table_shape[2] =
+                                node->block_tables()->shape()[2];
+                        }
+                        {
+                            try_var(hbf,
+                                    node->slot_mapping()->buffer().as_host());
+                            try_var(mbf, hbf.map(map_read));
+                            desc.slot_mapping = (int64_t *)mbf.buffer().data();
+                            desc.slot_mapping_shape[0] =
+                                node->slot_mapping()->shape()[0];
+                            desc.slot_mapping_shape[1] =
+                                node->slot_mapping()->shape()[1];
+                        }
+
+                        {
+                            auto &kv_cache = node->kv_caches()[0];
+                            if (kv_cache->dtype().equals(datatype_t::int64)) {
+                                // FIXME: TP is not supported yet
+                                CHECK_WITH_ERR(node->kv_caches().size() == 1,
+                                               std::errc::not_supported);
+                                // 1. kv_cache is addresses of kv cache buffers
+                                try_var(hbf, kv_cache->buffer().as_host());
+                                try_var(mbf, hbf.map(map_read));
+                                auto kv_cache_addrs_span =
+                                    runtime::as_span<const intptr_t>(
+                                        mbf.buffer());
+                                std::copy(kv_cache_addrs_span.begin(),
+                                          kv_cache_addrs_span.end(),
+                                          desc.kv_cache_addrs.begin());
+                            } else {
+                                // 2. kv_cache is kv cache buffers
+                                size_t i = 0;
+                                for (auto kv_cache : node->kv_caches()) {
+                                    try_var(hbf, kv_cache->buffer().as_host());
+                                    try_var(mbf, hbf.map(map_read));
+                                    desc.kv_cache_addrs[i++] =
+                                        reinterpret_cast<intptr_t>(
+                                            mbf.buffer().data());
+                                }
+                            }
+                        }
+                    }
+                }
+                inout_paged_kvcaches.push_back(descs);
+                input_descs_[input_id++] = thread_inout_desc{
+                    .data = (std::byte *)descs,
+                    .size = sizeof(thread_paged_attention_kv_cache_desc) *
+                            refspan.size(),
+                    .shape = const_cast<size_t *>(t->shape().data()),
+                    .strides = const_cast<size_t *>(t->strides().data()),
+                };
+            } else {
+                return err(std::errc::not_supported);
+            }
+        } else {
+            input_descs_[input_id++] = thread_inout_desc{
+                .data = m.buffer().data(),
+                .size = m.buffer().size(),
+                .shape = const_cast<size_t *>(t->shape().data()),
+                .strides = const_cast<size_t *>(t->strides().data()),
+            };
+        }
+        m.release();
+    }
+
+    try_var(mapped_output, output_buffer_->map(map_read_write));
+    auto output_data = mapped_output.buffer().data();
+
+    try_(run(output_data));
+
+    std::vector<value_t> outputs(return_size());
+    for (size_t i = 0; i < outputs.size(); i++) {
+        try_set(outputs[i], create_output_tensor(i, parameters, output_data));
+    }
+
+    for (auto arg : parameters) {
+        try_var(t, arg.as<tensor>());
+        try_var(hb, t->buffer().as_host());
+        try_(hb.unmap());
+    }
+
+    for (auto ptrs : inout_paged_kvcaches) {
+        delete[] ptrs;
+    }
+
+    auto output_value = outputs.size() == 1
+                            ? outputs[0]
+                            : tuple(std::in_place, std::move(outputs));
+    return ok(output_value);
+}
+
+result<tensor>
+cuda_runtime_function::create_output_tensor(size_t output_id,
+                                            std::span<value_t> parameters,
+                                            std::byte *output_data) noexcept {
+    auto &output_desc = output_descs_[output_id];
+    buffer_slice buffer;
+    intptr_t offset;
+    // 1. Find in inputs
+    for (size_t i = 0; i < input_descs_.size(); i++) {
+        auto &candidate_desc = input_descs_[i];
+        if (candidate_desc.data <= output_desc.data &&
+            candidate_desc.data + candidate_desc.size >=
+                output_desc.data + output_desc.size) {
+            try_var(t, parameters[i].as<tensor>());
+            buffer = t->buffer();
+            offset = output_desc.data - candidate_desc.data;
+            break;
+        }
+    }
+
+    // 2. Find in output buffer
+    if (buffer.buffer().empty()) {
+        if (output_data <= output_desc.data &&
+            output_data + output_buffer_->size_bytes() >=
+                output_desc.data + output_desc.size) {
+            buffer = buffer_slice(output_buffer_);
+            offset = output_desc.data - output_data;
+        }
+    }
+
+    if (buffer.buffer().empty()) {
+        return err(std::errc::invalid_argument);
+    }
+
+    // 2. Fix offset & size
+    buffer = buffer_slice(buffer.buffer(), buffer.start() + offset,
+                          output_desc.size);
+    try_var(output_type, return_type(output_id));
+    try_var(ttype, output_type.as<tensor_type>());
+    return ok(tensor(std::in_place, ttype->dtype(), output_shapes_[output_id],
+                     output_strides_[output_id], buffer));
+}
diff --git a/src/Native/src/runtime/cuda/runtime_function.h b/src/Native/src/runtime/cuda/runtime_function.h
new file mode 100644
index 000000000..818925e19
--- /dev/null
+++ b/src/Native/src/runtime/cuda/runtime_function.h
@@ -0,0 +1,93 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "nncase/ntt/profiling.h"
+#include "runtime_module.h"
+#include <cstddef>
+#include <nncase/ntt/arch/cuda/runtime.h>
+#include <nncase/runtime/host_buffer.h>
+#include <nncase/runtime/runtime_function.h>
+#include <nncase/tensor.h>
+#include <vector>
+
+BEGIN_NS_NNCASE_RT_MODULE(cuda)
+
+#define CHECK_CUDA(x)                                                          \
+    if ((x) != cudaSuccess) {                                                  \
+        std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - "  \
+                  << cudaGetErrorString(x) << std::endl;                       \
+        return err(std::errc::io_error);                                       \
+    }
+
+class cuda_runtime_function final : public runtime_function {
+  public:
+    static constexpr size_t default_profile_record_count = 10000;
+
+    cuda_runtime_function(runtime_module &rt_module);
+    virtual ~cuda_runtime_function();
+
+    cuda_runtime_module &module() const noexcept;
+
+    const std::span<std::byte>
+    thread_local_data(size_t chip_id) const noexcept {
+        return thread_local_datas_[chip_id];
+    }
+
+    const std::span<std::byte> warp_local_data(size_t chip_id) const noexcept {
+        return warp_local_datas_[chip_id];
+    }
+
+    const std::span<std::byte> block_local_data(size_t chip_id) const noexcept {
+        return block_local_datas_[chip_id];
+    }
+
+    const std::span<ntt::runtime::profile_record>
+    thread_local_profile_records(size_t chip_id) noexcept {
+        return profile_records_[chip_id];
+    }
+
+    const std::span<uint32_t>
+    thread_local_profile_record_counts(size_t chip_id) noexcept {
+        return profile_record_counts_[chip_id];
+    }
+
+  protected:
+    result<void>
+    initialize_core(runtime_function_init_context &context) noexcept override;
+    result<value_t> invoke_core(std::span<value_t> parameters,
+                                value_t return_value) noexcept override;
+
+  private:
+    result<void> run(std::byte *output_data) noexcept;
+    result<tensor> create_output_tensor(size_t output_id,
+                                        std::span<value_t> parameters,
+                                        std::byte *output_data) noexcept;
+
+  private:
+    block_entry_t block_entry_;
+    std::vector<std::span<std::byte>> thread_local_datas_;
+    std::vector<std::span<std::byte>> warp_local_datas_;
+    std::vector<std::span<std::byte>> block_local_datas_;
+    host_buffer_t output_buffer_;
+    std::vector<ntt::runtime::thread_inout_desc> input_descs_;
+    std::vector<ntt::runtime::thread_inout_desc> output_descs_;
+    std::vector<dims_t> output_shapes_;
+    std::vector<dims_t> output_strides_;
+
+    std::vector<std::vector<ntt::runtime::profile_record>> profile_records_;
+    std::vector<std::vector<uint32_t>> profile_record_counts_;
+};
+
+END_NS_NNCASE_RT_MODULE
diff --git a/src/Native/src/runtime/cuda/runtime_function.run.cpp b/src/Native/src/runtime/cuda/runtime_function.run.cpp
new file mode 100644
index 000000000..256c394d4
--- /dev/null
+++ b/src/Native/src/runtime/cuda/runtime_function.run.cpp
@@ -0,0 +1,87 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "runtime_function.h"
+#include <cuda_runtime_api.h>
+#include <nncase/ntt/arch/cuda/runtime.h>
+#include <nncase/runtime/dbg.h>
+#include <nncase/runtime/interpreter.h>
+#include <nncase/runtime/runtime_op_utility.h>
+#include <nncase/runtime/type_serializer.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::cuda;
+using namespace nncase::ntt::runtime;
+
+#define WARP_SIZE 32
+
+result<void> cuda_runtime_function::run(std::byte *output_data) noexcept {
+    auto enable_profiling = module()
+                                .interp()
+                                .options()
+                                .get_scalar_opt<uint8_t>("enable_profiling")
+                                .or_(false);
+    for (size_t cid = 0; cid < module().cdim(); cid++) {
+        CHECK_CUDA(cudaSetDevice(cid));
+
+        cuda_block_entry_params_t *params;
+        CHECK_CUDA(cudaMallocHost((void **)&params,
+                                  sizeof(cuda_block_entry_params_t)));
+
+        cuda_block_entry_params_t src_params{
+            .tdim = module().tdim(),
+            .bdim = module().bdim(),
+            .cdim = module().cdim(),
+            .cid = cid,
+            .enable_profiling = enable_profiling,
+            .input_descs = this->input_descs_.data(),
+            .output_descs = this->output_descs_.data(),
+            .rdata = module().rdata(),
+            .output = output_data,
+            .thread_local_rdata_header = module().thread_local_rdata_header(
+                cid * module().bdim() * module().wdim() * module().tdim()),
+            .thread_local_rdata = module().thread_local_rdata_content(),
+            .warp_local_rdata_header = module().warp_local_rdata_header(
+                cid * module().bdim() * module().wdim()),
+            .warp_local_rdata = module().warp_local_rdata_content(),
+            .block_local_rdata_header =
+                module().block_local_rdata_header(cid * module().bdim()),
+            .block_local_rdata = module().block_local_rdata_content(),
+            .thread_local_data = thread_local_data(cid),
+            .warp_local_data = warp_local_data(cid),
+            .block_local_data = block_local_data(cid),
+            .profile_records = enable_profiling
+                                   ? thread_local_profile_records(cid)
+                                   : std::span<ntt::runtime::profile_record>{},
+            .profile_record_counts =
+                enable_profiling
+                    ? thread_local_profile_record_counts(cid).data()
+                    : nullptr,
+        };
+        memcpy(params, &src_params, sizeof(cuda_block_entry_params_t));
+
+        void *args[] = {&params};
+        CHECK_CUDA(cudaLaunchKernel(
+            (const void *)block_entry_, dim3(module().bdim()),
+            dim3(module().wdim() * module().tdim()), args, 0, nullptr));
+    }
+
+    for (size_t cid = 0; cid < module().cdim(); cid++) {
+        CHECK_CUDA(cudaSetDevice(cid));
+        CHECK_CUDA(cudaDeviceSynchronize());
+    }
+
+    return ok();
+}
diff --git a/src/Native/src/runtime/cuda/runtime_module.cpp b/src/Native/src/runtime/cuda/runtime_module.cpp
new file mode 100644
index 000000000..2b95ed34e
--- /dev/null
+++ b/src/Native/src/runtime/cuda/runtime_module.cpp
@@ -0,0 +1,122 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "runtime_module.h"
+#include "nncase/runtime/host_buffer.h"
+#include "runtime_function.h"
+#include <nncase/ntt/arch/cuda/runtime.h>
+#include <nncase/runtime/dbg.h>
+#include <nncase/runtime/interpreter.h>
+#include <nncase/runtime/runtime_loader.h>
+#include <nncase/runtime/runtime_op_utility.h>
+#include <string_view>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::cuda;
+using namespace nncase::ntt::runtime;
+
+typedef struct {
+    uint32_t tdim;
+    uint32_t wdim;
+    uint32_t bdim;
+    uint32_t cdim;
+} module_desc_header;
+
+cuda_runtime_module::cuda_runtime_module() noexcept
+    : tdim_(0), wdim_(0), bdim_(0), cdim_(0) {}
+
+result<void> cuda_runtime_module::initialize_before_functions(
+    runtime_module_init_context &context) noexcept {
+    try_(context.read_section(
+        ".desc", [this](auto reader, size_t) -> result<void> {
+            auto header = reader.template read<module_desc_header>();
+            this->tdim_ = header.tdim;
+            this->wdim_ = header.wdim;
+            this->bdim_ = header.bdim;
+            this->cdim_ = header.cdim;
+            return ok();
+        }));
+
+    try_(initialize_text(context));
+    try_set(rdata_, initialize_section(context, ".rdata"));
+    try_set(thread_local_rdata_,
+            initialize_section(context, ".thread_local_rdata"));
+    try_set(warp_local_rdata_,
+            initialize_section(context, ".warp_local_rdata"));
+    try_set(block_local_rdata_,
+            initialize_section(context, ".block_local_rdata"));
+    return ok();
+}
+
+result<void> cuda_runtime_module::initialize_text(
+    runtime_module_init_context &context) noexcept {
+    auto cuda_external_module_path =
+        context.interp().options().get<std::string>(
+            "cuda_external_module_path");
+    if (cuda_external_module_path.is_ok() &&
+        !cuda_external_module_path.unwrap().empty()) {
+        loader_.load_from_file(cuda_external_module_path.unwrap());
+    } else {
+        try_set(text_,
+                context.get_or_read_section(".text", text_storage_, false));
+        loader_.load(text_);
+    }
+
+    return ok();
+}
+
+result<std::span<const std::byte>>
+cuda_runtime_module::initialize_section(runtime_module_init_context &context,
+                                        const char *name) noexcept {
+    host_buffer_t host_storage;
+    try_var(host_span, context.get_or_read_section(name, host_storage, false));
+    if (host_span.empty()) {
+        return ok(host_span);
+    } else {
+        std::byte *device_ptr;
+        CHECK_CUDA(cudaMalloc((void **)&device_ptr, host_span.size_bytes()));
+        CHECK_CUDA(cudaMemcpy(device_ptr, host_span.data(),
+                              host_span.size_bytes(), cudaMemcpyHostToDevice));
+        return ok(
+            std::span<const std::byte>(device_ptr, host_span.size_bytes()));
+    }
+}
+
+result<uintptr_t>
+cuda_runtime_module::native_handle(uint32_t flags) const noexcept {
+    CHECK_WITH_ERR(flags == 0, std::errc::invalid_argument);
+    return ok(loader_.handle());
+}
+
+result<block_entry_t> cuda_runtime_module::block_entry() const noexcept {
+    return ok((block_entry_t)loader_.entry());
+}
+
+result<std::unique_ptr<runtime_function>>
+cuda_runtime_module::create_function() noexcept {
+    std::unique_ptr<runtime_function> mod(new (std::nothrow)
+                                              cuda_runtime_function(*this));
+    if (mod)
+        return ok(std::move(mod));
+    return err(std::errc::not_enough_memory);
+}
+
+result<std::unique_ptr<runtime_module>> cuda::create_cuda_runtime_module() {
+    std::unique_ptr<runtime_module> mod(new (std::nothrow)
+                                            cuda_runtime_module());
+    if (mod)
+        return ok(std::move(mod));
+    return err(std::errc::not_enough_memory);
+}
diff --git a/src/Native/src/runtime/cuda/runtime_module.h b/src/Native/src/runtime/cuda/runtime_module.h
new file mode 100644
index 000000000..ac683962d
--- /dev/null
+++ b/src/Native/src/runtime/cuda/runtime_module.h
@@ -0,0 +1,118 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "loaders/cuda_loader.h"
+#include <cstdint>
+#include <nncase/ntt/arch/cuda/runtime.h>
+#include <nncase/runtime/cuda/runtime_module.h>
+
+BEGIN_NS_NNCASE_RT_MODULE(cuda)
+
+class cuda_runtime_module : public runtime_module {
+  public:
+    cuda_runtime_module() noexcept;
+    virtual ~cuda_runtime_module() = default;
+
+    result<uintptr_t> native_handle(uint32_t flags) const noexcept override;
+
+    uint64_t tdim() const noexcept { return tdim_; }
+    uint64_t wdim() const noexcept { return wdim_; }
+    uint64_t bdim() const noexcept { return bdim_; }
+    uint64_t cdim() const noexcept { return cdim_; }
+
+    result<block_entry_t> block_entry() const noexcept;
+    std::span<const std::byte> rdata() const noexcept { return rdata_; }
+
+    const std::span<const std::byte> thread_local_rdata() const noexcept {
+        return thread_local_rdata_;
+    }
+
+    const uint64_t *thread_local_rdata_header(size_t offset) const noexcept {
+        return reinterpret_cast<const uint64_t *>(thread_local_rdata_.data()) +
+               offset * 2;
+    }
+
+    const std::span<const std::byte>
+    thread_local_rdata_content() const noexcept {
+        return thread_local_rdata_.subspan(cdim_ * bdim_ * wdim_ * tdim_ * 2 *
+                                           sizeof(uint64_t));
+    }
+
+    const std::span<const std::byte> warp_local_rdata() const noexcept {
+        return warp_local_rdata_;
+    }
+
+    const uint64_t *warp_local_rdata_header(size_t offset) const noexcept {
+        return reinterpret_cast<const uint64_t *>(warp_local_rdata_.data()) +
+               offset * 2;
+    }
+
+    const std::span<const std::byte> warp_local_rdata_content() const noexcept {
+        return warp_local_rdata_.subspan(cdim_ * bdim_ * wdim_ * 2 *
+                                         sizeof(uint64_t));
+    }
+
+    const std::span<const std::byte> block_local_rdata() const noexcept {
+        return block_local_rdata_;
+    }
+
+    const uint64_t *block_local_rdata_header(size_t offset) const noexcept {
+        return reinterpret_cast<const uint64_t *>(block_local_rdata_.data()) +
+               offset * 2;
+    }
+
+    const std::span<const std::byte>
+    block_local_rdata_content() const noexcept {
+        return block_local_rdata_.subspan(cdim_ * bdim_ * 2 * sizeof(uint64_t));
+    }
+
+#ifdef __APPLE__
+    pthread_key_t cpu_thread_context_key() const noexcept {
+        return cpu_thread_context_key_;
+    }
+#endif
+
+  protected:
+    result<void> initialize_before_functions(
+        runtime_module_init_context &context) noexcept override;
+    result<std::unique_ptr<runtime_function>>
+    create_function() noexcept override;
+
+  private:
+    result<void> initialize_text(runtime_module_init_context &context) noexcept;
+    result<std::span<const std::byte>>
+    initialize_section(runtime_module_init_context &context,
+                       const char *name) noexcept;
+
+  private:
+    uint64_t tdim_;
+    uint64_t wdim_;
+    uint64_t bdim_;
+    uint64_t cdim_;
+    std::span<const std::byte> text_;
+    std::span<const std::byte> rdata_;
+    std::span<const std::byte> thread_local_rdata_;
+    std::span<const std::byte> warp_local_rdata_;
+    std::span<const std::byte> block_local_rdata_;
+    host_buffer_t text_storage_;
+    host_buffer_t rdata_storage_;
+    host_buffer_t thread_local_rdata_storage_;
+    host_buffer_t warp_local_rdata_storage_;
+    host_buffer_t block_local_rdata_storage_;
+
+    cuda_loader loader_;
+};
+
+END_NS_NNCASE_RT_MODULE
diff --git a/src/Native/src/runtime/runtime_loader.cpp b/src/Native/src/runtime/runtime_loader.cpp
index 0d8fe7dfd..0a6d1777a 100644
--- a/src/Native/src/runtime/runtime_loader.cpp
+++ b/src/Native/src/runtime/runtime_loader.cpp
@@ -12,6 +12,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "nncase/runtime/model.h"
 #ifdef WIN32
 #include <Windows.h>
 #elif defined(__unix__) || defined(__APPLE__)
@@ -20,6 +21,9 @@
 
 #include <cstring>
 #include <nncase/runtime/cpu/runtime_module.h>
+#ifdef NNCASE_ENABLE_CUDA_RUNTIME
+#include <nncase/runtime/cuda/runtime_module.h>
+#endif
 #include <nncase/runtime/runtime_loader.h>
 #include <nncase/runtime/runtime_module.h>
 
@@ -111,17 +115,45 @@ FindRuntimeMethod(activator)
 #undef FindRuntimeMethod
 #endif
 
+namespace {
+std::pair<module_kind_t, rt_module_activator_t> builtin_activators[] = {
+    {cpu::cpu_module_kind,
+     [](result<std::unique_ptr<runtime_module>> &out) {
+         out = cpu::create_cpu_runtime_module();
+     }},
+#ifdef NNCASE_ENABLE_CUDA_RUNTIME
+    {cuda::cuda_module_kind,
+     [](result<std::unique_ptr<runtime_module>> &out) {
+         out = cuda::create_cuda_runtime_module();
+     }},
+#endif
+};
+
+result<rt_module_activator_t>
+create_builtin_activator(const module_kind_t &kind) {
+    for (auto &activator : builtin_activators) {
+        if (!strncmp(kind.data(), activator.first.data(),
+                     MAX_MODULE_KIND_LENGTH)) {
+            return ok(activator.second);
+        }
+    }
+    return err(nncase_errc::runtime_not_found);
+}
+} // namespace
+
 result<std::unique_ptr<runtime_module>>
 runtime_module::create(const module_kind_t &kind) {
-    if (!strncmp(kind.data(), cpu::cpu_module_kind.data(),
-                 MAX_MODULE_KIND_LENGTH)) {
-        return cpu::create_cpu_runtime_module();
+    auto activator = create_builtin_activator(kind);
+    if (activator.is_err()) {
+        activator = find_runtime_activator(kind);
     }
 
+    if (activator.is_err())
+        return err(activator.unwrap_err());
+
     result<std::unique_ptr<runtime_module>> rt_module(
         nncase_errc::runtime_not_found);
-    try_var(activator, find_runtime_activator(kind));
-    activator(rt_module);
+    activator.unwrap()(rt_module);
     return rt_module;
 }
 
diff --git a/src/Native/src/test.cpp b/src/Native/src/test.cpp
index c8298c3ee..d0788c2f7 100644
--- a/src/Native/src/test.cpp
+++ b/src/Native/src/test.cpp
@@ -134,6 +134,7 @@ void test_strides() {
 void test_sharding() {
     // local_index
     {
+        using namespace ntt::distributed;
         using mesh_type =
             ntt::distributed::mesh<ntt::distributed::topology::thread, 1>;
 
@@ -601,6 +602,7 @@ void test_matmul_transpose_b() {
 }
 
 void test_caching() {
+    using namespace ntt::distributed;
     {
         constexpr auto NumLayer = fixed_dim_v<1>;
         constexpr auto NumKVHead = fixed_dim_v<2>;
@@ -1269,7 +1271,7 @@ void test_concat() {
     auto pc = ntt::make_tensor<ntt::vector<float, 8>>(ntt::fixed_shape_v<3, 3>);
     ntt::pack(ta, pa, ntt::fixed_shape_v<1>);
     ntt::pack(tb, pb, ntt::fixed_shape_v<1>);
-    ntt::concat(std::make_tuple(pa, pb), pc, 1_dim);
+    ntt::concat(ntt::make_tuple(pa, pb), pc, 1_dim);
     ntt::unpack(pc, tc, ntt::fixed_shape_v<1>);
 
     assert(tc(0, 0) == 0.f);
diff --git a/src/Native/src/test_cli.cpp b/src/Native/src/test_cli.cpp
index 3c0135297..4ad340061 100644
--- a/src/Native/src/test_cli.cpp
+++ b/src/Native/src/test_cli.cpp
@@ -161,6 +161,7 @@ result<void> write_tensor_buffer(value_t value, std::ofstream &of) {
 result<void> run_core(const std::string &kmodel_path,
                       const std::vector<std::string> &files, size_t loop_count,
                       bool warmup, const std::string &cpu_external_module_path,
+                      const std::string &cuda_external_module_path,
                       const std::string &xpu_external_module_path) {
     std::ifstream kmodel(kmodel_path, std::ios::binary | std::ios::in);
     if (!kmodel.is_open())
@@ -171,6 +172,10 @@ result<void> run_core(const std::string &kmodel_path,
         interp.options().set("cpu_external_module_path",
                              cpu_external_module_path);
     }
+    if (!cuda_external_module_path.empty()) {
+        interp.options().set("cuda_external_module_path",
+                             cuda_external_module_path);
+    }
     if (!xpu_external_module_path.empty()) {
         interp.options().set("xpu_external_module_path",
                              xpu_external_module_path);
@@ -297,6 +302,7 @@ int main(int argc, char **argv) {
       ("l,loop", "Number of inference iterations", cxxopts::value<size_t>()->default_value("1"))
       ("w,warmup", "Enable warmup before inference", cxxopts::value<bool>()->default_value("false")->implicit_value("true"))
       ("cpu_external_module_path", "Path to external module for CPU backend (optional, for testing purposes)", cxxopts::value<std::string>()->default_value(""))
+      ("cuda_external_module_path", "Path to external module for CUDA backend (optional, for testing purposes)", cxxopts::value<std::string>()->default_value(""))
       ("xpu_external_module_path", "Path to external module for XPU backend (optional, for testing purposes)", cxxopts::value<std::string>()->default_value(""))
       ("h,help", "Print usage");
     // clang-format on
@@ -326,6 +332,7 @@ int main(int argc, char **argv) {
 
     run_core(kmodel_bin, bins, loop_count, warmup,
              result["cpu_external_module_path"].as<std::string>(),
+             result["cuda_external_module_path"].as<std::string>(),
              result["xpu_external_module_path"].as<std::string>())
         .unwrap_or_throw();
     return 0;
diff --git a/src/Native/src/test_dynamic_cli.cpp b/src/Native/src/test_dynamic_cli.cpp
index 6fac0d1d1..98767efb1 100644
--- a/src/Native/src/test_dynamic_cli.cpp
+++ b/src/Native/src/test_dynamic_cli.cpp
@@ -128,7 +128,7 @@ result<std::vector<value_t>> to_values(value_t v) {
 //     auto d = (T *)get_output_span(t).unwrap().data();
 //     std::vector<T> dec_result(d, d + t->length());
 //     auto dec_shape = t->shape();
-//     return std::make_tuple(dec_result, dec_shape);
+//     return ntt::make_tuple(dec_result, dec_shape);
 // }
 
 std::vector<std::string> split(const std::string &s, char delim) {
diff --git a/src/Nncase.CodeGen/CodeGen/SectionManager.cs b/src/Nncase.CodeGen/CodeGen/SectionManager.cs
index 6daa0b7df..d49ec0173 100644
--- a/src/Nncase.CodeGen/CodeGen/SectionManager.cs
+++ b/src/Nncase.CodeGen/CodeGen/SectionManager.cs
@@ -23,6 +23,8 @@ public static class WellknownSectionNames
 
     public static readonly string ThreadLocalCache = ".thread_local_cache";
 
+    public static readonly string WarpLocalRdata = ".warp_local_rdata";
+
     public static readonly string BlockLocalRdata = ".block_local_rdata";
 }
 
diff --git a/src/Nncase.Compiler/Compiler.cs b/src/Nncase.Compiler/Compiler.cs
index b59ed241b..03cdd293d 100644
--- a/src/Nncase.Compiler/Compiler.cs
+++ b/src/Nncase.Compiler/Compiler.cs
@@ -394,8 +394,8 @@ await RunPassAsync(
         await RunPassAsync(AutoVectorizePass, "AutoVectorizePass");
         await RunPassAsync(AutoPackingPass, "AutoPackingPass");
         await RunPassAsync(AutoDistributedPass, "AutoDistributedPass");
-        await RunPassAsync(AutoTilingPass, "AutoTilingPass");
 
+        // await RunPassAsync(AutoTilingPass, "AutoTilingPass");
         await RunPassAsync(TIRPass, "TIRPass");
 
         await RunPassAsync(
diff --git a/src/Nncase.Compiler/Interop/CApi.cs b/src/Nncase.Compiler/Interop/CApi.cs
index fda76003e..eaad42513 100644
--- a/src/Nncase.Compiler/Interop/CApi.cs
+++ b/src/Nncase.Compiler/Interop/CApi.cs
@@ -660,9 +660,17 @@ private static void CompilerCompile(IntPtr compilerHandle)
     [UnmanagedCallersOnly]
     private static void CompilerGencode(IntPtr compilerHandle, IntPtr streamHandle)
     {
-        var compiler = Get<Compiler>(compilerHandle);
-        var stream = Get<CStream>(streamHandle);
-        compiler.Gencode(stream);
+        try
+        {
+            var compiler = Get<Compiler>(compilerHandle);
+            var stream = Get<CStream>(streamHandle);
+            compiler.Gencode(stream);
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine(ex);
+            Environment.FailFast(ex.ToString());
+        }
     }
 
     [UnmanagedCallersOnly]
diff --git a/src/Nncase.Core/DistributedType.cs b/src/Nncase.Core/DistributedType.cs
index ccb7bb6bb..f52e0dd04 100644
--- a/src/Nncase.Core/DistributedType.cs
+++ b/src/Nncase.Core/DistributedType.cs
@@ -154,6 +154,8 @@ public sealed record Placement(IRArray<int> Hierarchy, string Name, HierarchyKin
     // }
     public int Rank => Hierarchy.Count;
 
+    public bool HasWarp => Name.Contains('w', StringComparison.Ordinal);
+
     public override string ToString() => $"[{string.Join(',', Hierarchy.Zip(Name).Select(t => t.Second.ToString() + ':' + t.First.ToString()))}]";
 }
 
diff --git a/src/Nncase.Core/IR/TensorConst.cs b/src/Nncase.Core/IR/TensorConst.cs
index be3e7f756..f1f4334c5 100644
--- a/src/Nncase.Core/IR/TensorConst.cs
+++ b/src/Nncase.Core/IR/TensorConst.cs
@@ -142,12 +142,25 @@ public TensorConst(Tensor tensor, IRArray<SBP> ndsbp, Placement placement)
 
     public MemoryLocation GetMemoryLocation()
     {
-        return ValueType switch
+        if (ValueType is DistributedType dt)
         {
-            DistributedType dt when dt.AxisPolicies.Any(p => p is SBPSplit split && split.Axes.Contains(dt.Placement.Rank - 1)) => MemoryLocation.ThreadLocalRdata,
-            DistributedType => MemoryLocation.BlockLocalRdata,
-            _ => MemoryLocation.Rdata,
-        };
+            if (dt.AxisPolicies.Any(p => p is SBPSplit split && split.Axes.Contains(dt.Placement.Rank - 1)))
+            {
+                return MemoryLocation.ThreadLocalRdata;
+            }
+            else if (dt.Placement.HasWarp && dt.AxisPolicies.Any(p => p is SBPSplit split && split.Axes.Contains(dt.Placement.Rank - 2)))
+            {
+                return MemoryLocation.WarpLocalRdata;
+            }
+            else
+            {
+                return MemoryLocation.BlockLocalRdata;
+            }
+        }
+        else
+        {
+            return MemoryLocation.Rdata;
+        }
     }
 
     /// <inheritdoc/>
diff --git a/src/Nncase.Core/Schedule/ScheduleTypes.cs b/src/Nncase.Core/Schedule/ScheduleTypes.cs
index cd8c8271e..33ab2205b 100644
--- a/src/Nncase.Core/Schedule/ScheduleTypes.cs
+++ b/src/Nncase.Core/Schedule/ScheduleTypes.cs
@@ -85,6 +85,7 @@ public SchedFunctionResult()
     {
         Rdatas = new(ReferenceEqualityComparer.Instance);
         ThreadLocalRdatas = new(ReferenceEqualityComparer.Instance);
+        WarpLocalRdatas = new(ReferenceEqualityComparer.Instance);
         BlockLocalRdatas = new(ReferenceEqualityComparer.Instance);
         DataUsage = 0;
         BlockLocalDataPoolSize = 0;
@@ -101,6 +102,11 @@ public SchedFunctionResult()
     /// </summary>
     public Dictionary<IR.Const, ValueRange<ulong>> ThreadLocalRdatas { get; }
 
+    /// <summary>
+    /// Gets the buffer allocation.
+    /// </summary>
+    public Dictionary<IR.Const, ValueRange<ulong>> WarpLocalRdatas { get; }
+
     /// <summary>
     /// Gets the buffer allocation.
     /// </summary>
@@ -111,6 +117,11 @@ public SchedFunctionResult()
     /// </summary>
     public ulong DataUsage { get; set; }
 
+    /// <summary>
+    /// Gets or sets the warp local data section length.
+    /// </summary>
+    public ulong WarpLocalDataPoolSize { get; set; }
+
     /// <summary>
     /// Gets or sets the block local data section length.
     /// </summary>
diff --git a/src/Nncase.Core/TIR/PhysicalBuffer.cs b/src/Nncase.Core/TIR/PhysicalBuffer.cs
index b3ce916fd..09098f9f5 100644
--- a/src/Nncase.Core/TIR/PhysicalBuffer.cs
+++ b/src/Nncase.Core/TIR/PhysicalBuffer.cs
@@ -30,32 +30,42 @@ public enum MemoryLocation
     /// <summary>
     /// thread local constant data.
     /// </summary>
-    ThreadLocalRdata = 1 << 4,
+    ThreadLocalRdata = 1 << 5,
+
+    /// <summary>
+    /// lane local constant data.
+    /// </summary>
+    WarpLocalRdata = 1 << 4,
 
     /// <summary>
     /// block local constant data.
     /// </summary>
-    BlockLocalRdata = 1 << 5,
+    BlockLocalRdata = 1 << 6,
 
     /// <summary>
     /// compute temp data.
     /// </summary>
-    Data = 1 << 6,
+    Data = 1 << 7,
+
+    /// <summary>
+    /// warp local data.
+    /// </summary>
+    WarpLocalData = 1 << 8,
 
     /// <summary>
     /// block local data.
     /// </summary>
-    BlockLocalData = 1 << 7,
+    BlockLocalData = 1 << 9,
 
     /// <summary>
     /// cache.
     /// </summary>
-    Cache = 1 << 8,
+    Cache = 1 << 10,
 
     /// <summary>
     /// base addr.
     /// </summary>
-    PrivateBase = 1 << 10,
+    PrivateBase = 1 << 11,
 }
 
 public sealed class PhysicalBuffer : BaseExpr
diff --git a/src/Nncase.Evaluator/NN/Conv2D.cs b/src/Nncase.Evaluator/NN/Conv2D.cs
index 6309e0588..fdedb0e53 100644
--- a/src/Nncase.Evaluator/NN/Conv2D.cs
+++ b/src/Nncase.Evaluator/NN/Conv2D.cs
@@ -131,8 +131,8 @@ private IRType Visit(ITypeInferenceContext context, Conv2D target, DistributedTy
         var ndsbp = new SBP[input.Placement.Rank];
         for (int i = 0; i < ndsbp.Length; i++)
         {
-            var invalid = new InvalidType($"({input.AxisPolicies[i]}, {weights.AxisPolicies[i]}) not support");
-            switch (input.AxisPolicies[i], weights.AxisPolicies[i])
+            var invalid = new InvalidType($"({ndsbpsIf[i]}, {ndsbpsW[i]}) not support");
+            switch (ndsbpsIf[i], ndsbpsW[i])
             {
                 case (SBPSplit sa, SBPSplit sb):
                     // split on ic
diff --git a/src/Nncase.Importer/BaseImporter.cs b/src/Nncase.Importer/BaseImporter.cs
index fbb505950..ffb9243cb 100644
--- a/src/Nncase.Importer/BaseImporter.cs
+++ b/src/Nncase.Importer/BaseImporter.cs
@@ -184,7 +184,7 @@ protected Paddings GetPaddings(Expr expr)
 
     private IRModule CreateModule(IVar[] inputs, Dictionary<IVar, Dimension[]> varMap, BaseExpr body)
     {
-        var mainFunc = new Function("main", body, inputs, varMap);
+        var mainFunc = new Function("main", CompileSession.Target.Name, body, inputs, varMap);
         var module = new IRModule(mainFunc);
         return module;
     }
diff --git a/src/Nncase.Importer/Onnx/QLinearConv.cs b/src/Nncase.Importer/Onnx/QLinearConv.cs
index 3ab4f7dfe..26540e8c2 100644
--- a/src/Nncase.Importer/Onnx/QLinearConv.cs
+++ b/src/Nncase.Importer/Onnx/QLinearConv.cs
@@ -60,7 +60,7 @@ private Expr VisitQLinearConv(in NodeProto op)
             if (bias == null)
             {
                 int? ocNumber = (int)((TensorConst)weights).CheckedShape[0].FixedValue;
-                var zeroBias = new TensorConst(new int[ocNumber == null ? default(int) : ocNumber.Value]);
+                var zeroBias = new TensorConst(new int[ocNumber == null ? default : ocNumber.Value]);
                 var conv = F.NN.Conv2D(inputDeq, weightsDeq, zeroBias, strideArr, pads, dilationArr, PadMode.Constant, group);
                 return Quantize(conv, new QuantParam(((TensorConst)yZeroPoint).Value.ToScalar<int>(), ((TensorConst)yScale).Value.ToScalar<float>()), ((TensorConst)yZeroPoint).CheckedDataType);
             }
diff --git a/src/Nncase.Schedule/Schedule/Bufferize/BufferizeVisitor.cs b/src/Nncase.Schedule/Schedule/Bufferize/BufferizeVisitor.cs
index 4b1c6c7d6..6214bb854 100644
--- a/src/Nncase.Schedule/Schedule/Bufferize/BufferizeVisitor.cs
+++ b/src/Nncase.Schedule/Schedule/Bufferize/BufferizeVisitor.cs
@@ -18,6 +18,7 @@ public sealed class BufferizeVisitor : ExprRewriter
     private readonly IGrouping<string, PrimFunction> _functions;
     private long _currentRdataStart;
     private long _currentThreadLocalRdataStart;
+    private long _currentWarpLocalRdataStart;
     private long _currentBlockLocalRdataStart;
     private int _dataBufferId;
 
@@ -44,6 +45,7 @@ protected override BaseExpr RewriteLeafPrimFunction(PrimFunction func)
             {
                 MemoryLocation.Rdata => new BufferScheduleOptions(_currentRdataStart),
                 MemoryLocation.ThreadLocalRdata => new BufferScheduleOptions(_currentThreadLocalRdataStart),
+                MemoryLocation.WarpLocalRdata => new BufferScheduleOptions(_currentWarpLocalRdataStart),
                 MemoryLocation.BlockLocalRdata => new BufferScheduleOptions(_currentBlockLocalRdataStart),
                 _ => new BufferScheduleOptions(),
             });
@@ -56,9 +58,11 @@ protected override BaseExpr RewriteLeafPrimFunction(PrimFunction func)
 
             AssignOutputResult(func, scheduleResult);
             AssignDataResult(func, scheduleResult);
+            AssignWarpLocalDataResult(func, scheduleResult);
             AssignBlockLocalDataResult(func, scheduleResult);
             AssignRdataResult(func, scheduleResult);
             AssignThreadLocalRdataResult(func, scheduleResult);
+            AssignWarpLocalRdataResult(func, scheduleResult);
             AssignBlockLocalRdataResult(func, scheduleResult);
 
             var bufferReplaces = scheduleResult.SelectMany(x => x.Value.Buffers).ToDictionary(ReferenceEqualityComparer.Instance);
@@ -81,10 +85,14 @@ protected override BaseExpr RewriteLeafCall(Call expr)
             T.CreateBuffer(new TensorType(DataTypes.UInt8, [(long)func.SchedResult.DataUsage]), MemoryLocation.Data, out var dataBuffer, $"data_{_dataBufferId++}");
             var dataVar = new Var("data", TensorType.Scalar(new PointerType(DataTypes.UInt8)));
 
+            T.CreateBuffer(new TensorType(DataTypes.UInt8, [(long)func.SchedResult.WarpLocalDataPoolSize]), MemoryLocation.WarpLocalData, out var warpLocalDataBuffer, $"warp_local_data_{_dataBufferId++}");
+            var warpLocalDataVar = new Var("warp_local_data", TensorType.Scalar(new PointerType(DataTypes.UInt8)));
+
             T.CreateBuffer(new TensorType(DataTypes.UInt8, [(long)func.SchedResult.BlockLocalDataPoolSize]), MemoryLocation.BlockLocalData, out var blockLocalDataBuffer, $"block_local_data_{_dataBufferId++}");
             var blockLocalDataVar = new Var("block_local_data", TensorType.Scalar(new PointerType(DataTypes.UInt8)));
-            var funcParams = func.Parameters.ToArray().Append(dataVar).Append(blockLocalDataVar).ToArray();
-            var funcArgs = expr.Arguments.ToArray().Append(dataBuffer).Append(blockLocalDataBuffer).ToArray();
+
+            var funcParams = func.Parameters.ToArray().Append(dataVar).Append(warpLocalDataVar).Append(blockLocalDataVar).ToArray();
+            var funcArgs = expr.Arguments.ToArray().Append(dataBuffer).Append(warpLocalDataBuffer).Append(blockLocalDataBuffer).ToArray();
             var newFunc = func.With(parameters: funcParams);
             return expr.With(target: newFunc, arguments: funcArgs);
         }
@@ -110,6 +118,15 @@ private void AssignDataResult(PrimFunction func, IReadOnlyDictionary<MemoryLocat
         }
     }
 
+    private void AssignWarpLocalDataResult(PrimFunction func, IReadOnlyDictionary<MemoryLocation, BufferScheduleResult> scheduleResult)
+    {
+        if (scheduleResult.TryGetValue(MemoryLocation.WarpLocalData, out var warpLocalDataResult))
+        {
+            func.SchedResult.DataAlign = Math.Max(8, (ulong)warpLocalDataResult.Alignment);
+            func.SchedResult.WarpLocalDataPoolSize = MathUtility.AlignUp((ulong)warpLocalDataResult.MemoryPoolEnd, func.SchedResult.DataAlign);
+        }
+    }
+
     private void AssignBlockLocalDataResult(PrimFunction func, IReadOnlyDictionary<MemoryLocation, BufferScheduleResult> scheduleResult)
     {
         if (scheduleResult.TryGetValue(MemoryLocation.BlockLocalData, out var blockLocalDataResult))
@@ -136,16 +153,31 @@ private void AssignRdataResult(PrimFunction func, IReadOnlyDictionary<MemoryLoca
 
     private void AssignThreadLocalRdataResult(PrimFunction func, IReadOnlyDictionary<MemoryLocation, BufferScheduleResult> scheduleResult)
     {
-        if (scheduleResult.TryGetValue(MemoryLocation.ThreadLocalRdata, out var threadLocalRdataResult))
+        if (scheduleResult.TryGetValue(MemoryLocation.ThreadLocalRdata, out var threadOrWarpLocalRdataResult))
         {
-            foreach ((var buffer, var lifetime) in threadLocalRdataResult.Buffers)
+            foreach ((var buffer, var lifetime) in threadOrWarpLocalRdataResult.Buffers)
             {
                 var constValue = (Const)((Call)buffer.Start)[IR.Buffers.AddressOf.Input];
                 var range = new ValueRange<ulong>((ulong)lifetime.Memory.Start, (ulong)lifetime.Memory.Stop);
                 func.SchedResult.ThreadLocalRdatas.Add(constValue, range);
             }
 
-            _currentThreadLocalRdataStart = threadLocalRdataResult.MemoryPoolEnd;
+            _currentThreadLocalRdataStart = threadOrWarpLocalRdataResult.MemoryPoolEnd;
+        }
+    }
+
+    private void AssignWarpLocalRdataResult(PrimFunction func, IReadOnlyDictionary<MemoryLocation, BufferScheduleResult> scheduleResult)
+    {
+        if (scheduleResult.TryGetValue(MemoryLocation.WarpLocalRdata, out var warpLocalRdataResult))
+        {
+            foreach ((var buffer, var lifetime) in warpLocalRdataResult.Buffers)
+            {
+                var constValue = (Const)((Call)buffer.Start)[IR.Buffers.AddressOf.Input];
+                var range = new ValueRange<ulong>((ulong)lifetime.Memory.Start, (ulong)lifetime.Memory.Stop);
+                func.SchedResult.WarpLocalRdatas.Add(constValue, range);
+            }
+
+            _currentWarpLocalRdataStart = warpLocalRdataResult.MemoryPoolEnd;
         }
     }
 
diff --git a/src/Nncase.Schedule/Transforms/AutoTilePass.cs b/src/Nncase.Schedule/Transforms/AutoTilePass.cs
index b6e0ccb6a..bf0bb6031 100644
--- a/src/Nncase.Schedule/Transforms/AutoTilePass.cs
+++ b/src/Nncase.Schedule/Transforms/AutoTilePass.cs
@@ -58,7 +58,7 @@ protected override Task<BaseFunction> RunCoreAsync(BaseFunction input, RunPassCo
         using var ctx = IntegerSetLibrary.ctx.Create();
         ctx.set_ast_build_detect_min_max(1);
         var tiler = new GraphTiler();
-        if (!(input is Function func && func.ModuleKind == ModuleKind))
+        if (!(input is Function func))
         {
             return Task.FromResult(input);
         }
diff --git a/src/Nncase.Schedule/Transforms/TIRSelectionPass.cs b/src/Nncase.Schedule/Transforms/TIRSelectionPass.cs
index 6cfc000de..d931f2e4e 100644
--- a/src/Nncase.Schedule/Transforms/TIRSelectionPass.cs
+++ b/src/Nncase.Schedule/Transforms/TIRSelectionPass.cs
@@ -25,8 +25,7 @@ public TIRSelectionPass(string moduleKind)
 
     protected override Task<BaseFunction> RunCoreAsync(BaseFunction input, RunPassContext context)
     {
-        if (input.ModuleKind == ModuleKind
-            && input is Function func)
+        if (input is Function func)
         {
             var callers = func.Users.Where(x => x is Call or FunctionWrapper).ToArray();
             var isEntry = callers.Length == 0;
diff --git a/src/Nncase.Tests/Rewrite/Fusion/UnitTestGraphPartition.cs b/src/Nncase.Tests/Rewrite/Fusion/UnitTestGraphPartition.cs
index 3c11ba8b8..e164dc4a7 100644
--- a/src/Nncase.Tests/Rewrite/Fusion/UnitTestGraphPartition.cs
+++ b/src/Nncase.Tests/Rewrite/Fusion/UnitTestGraphPartition.cs
@@ -63,7 +63,7 @@ public async Task TestLineSameModuleI()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -98,7 +98,7 @@ public async Task TestLineSameModuleC()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -135,7 +135,7 @@ public async Task TestLineDiffModuleC2I()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -172,7 +172,7 @@ public async Task TestLineDiffModuleI2C()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -213,7 +213,7 @@ public async Task TestYSameModuleI()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -253,7 +253,7 @@ public async Task TestYSameModuleC()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -288,7 +288,7 @@ public async Task TestHandInHandSameModuleI()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -323,7 +323,7 @@ public async Task TestHandInHandSameModuleC()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -361,7 +361,7 @@ public async Task TestCircle1SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -399,7 +399,7 @@ public async Task TestCircle2SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -440,7 +440,7 @@ public async Task TestCircle2DiffModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -480,7 +480,7 @@ public async Task TestCircle3SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -518,7 +518,7 @@ public async Task TestCircle4SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -555,7 +555,7 @@ public async Task TestCircle5SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -591,7 +591,7 @@ public async Task TestTuple1SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -629,7 +629,7 @@ public async Task TestTuple2SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
@@ -666,7 +666,7 @@ public async Task TestConcat1SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -705,7 +705,7 @@ public async Task TestConcat2SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -741,7 +741,7 @@ public async Task TestConcat3SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -781,7 +781,7 @@ public async Task TestConcat4SameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
 
         await prmg.RunAsync(module);
 
@@ -821,7 +821,7 @@ public async Task TestSplitSameModule()
         var module = new IRModule(main);
 
         var prmg = CompileSession.CreatePassManager("prmg");
-        prmg.Add<ModulePartitionPass>(new NTTModuleCompiler());
+        prmg.Add<ModulePartitionPass>(new CPUModuleCompiler());
         await prmg.RunAsync(module);
 
         tv.Clear();
diff --git a/src/Nncase.Tests/Targets/UnitTestCUDAKernels.cs b/src/Nncase.Tests/Targets/UnitTestCUDAKernels.cs
new file mode 100644
index 000000000..5e40b1c87
--- /dev/null
+++ b/src/Nncase.Tests/Targets/UnitTestCUDAKernels.cs
@@ -0,0 +1,2129 @@
+﻿// Copyright (c) Canaan Inc. All rights reserved.
+// Licensed under the Apache license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Runtime;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Tasks;
+using NetFabric.Hyperlinq;
+using Nncase.CodeGen;
+using Nncase.IR;
+using Nncase.IR.Math;
+using Nncase.IR.NN;
+using Nncase.IR.Tensors;
+using Nncase.Passes;
+using Nncase.Passes.Transforms;
+using Nncase.Runtime.Interop;
+using Nncase.Targets;
+using Nncase.Tests.TestFixture;
+using Nncase.Utilities;
+using OrtKISharp;
+using Xunit;
+using static Nncase.Evaluator.OrtKIExtensions;
+
+namespace Nncase.Tests.TargetTest;
+
+public class CUDAKernelCase
+{
+    public CUDAKernelCase(string name, Fusion fusion, IVar[] vars, Tensor[] inputs, Tensor[] rtinputs)
+    {
+        Name = name;
+        Fusion = fusion;
+        Vars = vars;
+        Inputs = inputs;
+        RTInputs = rtinputs;
+    }
+
+    public string Name { get; }
+
+    public Fusion Fusion { get; }
+
+    public IReadOnlyList<IVar> Vars { get; }
+
+    public IReadOnlyList<Tensor> Inputs { get; }
+
+    public IReadOnlyList<Tensor> RTInputs { get; set; }
+}
+
+[Collection(nameof(NotThreadSafeResourceCollection))]
+[AutoSetupTestMethod(InitSession = true)]
+public sealed class UnitTestCUDAKernels : TestClassBase
+{
+    public UnitTestCUDAKernels()
+    {
+        DefaultTargetName = CUDATarget.Kind;
+        CompileOptions.TargetOptions = new NTTTargetOptions
+        {
+            HierarchyNames = "t",
+            Hierarchies = [[32]],
+        };
+#if DEBUG
+        CompileOptions.DumpFlags = Diagnostics.DumpFlags.PassIR | Diagnostics.DumpFlags.Compile | Diagnostics.DumpFlags.Schedule | Diagnostics.DumpFlags.Rewrite | Diagnostics.DumpFlags.CodeGen | Diagnostics.DumpFlags.EGraphCost | Diagnostics.DumpFlags.Tiling;
+#endif
+    }
+
+    public enum PostOpKind
+    {
+        None,
+        MulScalar,
+        ScalarDiv,
+    }
+
+    public static Placement DefaultPlacement => new Placement(new[] { 1 }, "t");
+
+    public static int Lane => 16;
+
+    public static int Rank => 1;
+
+    public static TheoryData<long[], int[], List<int[][]>, int> TestReshardData { get; } = new()
+    {
+        { [1, 77, 768], [2, 32, 4], new() { new int[][] { [-1, 1], [-1, 1], [0, 2] }, new int[][] { [-1, 2], [-1, 2], [0, 1] } }, 0 },
+    };
+
+    public static TheoryData<BinaryOp, long[], long[], int[], int[][], PostOpKind[], int> TestVectorizeBinaryData { get; } = new()
+    {
+        { BinaryOp.Add, [8, 2], [8, 2], [1], [], [], 0 },
+        { BinaryOp.Mul, [1, 8, 64, 2 * 8], [1, 1, 64, 2 * 8], [1], [], [], 1 },
+        { BinaryOp.Add, [8, 16], [16], [1], [], [], 2 },
+        { BinaryOp.Mul, [1, 8, 64, 2 * 8], [1, 1, 64, 2 * 8], [4], [[-1], [-1], [0], [-1]], [], 3 },
+        { BinaryOp.Add, [8, 2], [8, 2], [1], [], [PostOpKind.MulScalar], 4 },
+        { BinaryOp.Mul, [1, 8, 64, 2 * 8], [1, 1, 64, 2 * 8], [1], [], [PostOpKind.MulScalar, PostOpKind.MulScalar], 5 },
+    };
+
+    public static TheoryData<ReduceOp, long[], int[], float, bool, int[], int[][], int> TestVectorizeReduceData { get; } = new()
+    {
+        { ReduceOp.Sum, new long[] { 1, 64, 384, 128 }, new[] { 3 }, 0, true, new[] { 1 }, [], 0 },
+        { ReduceOp.Mean, new long[] { 1, 384, 128 }, new[] { 2 }, 0, true, new[] { 1 }, [], 1 },
+        { ReduceOp.Mean, new long[] { 1, 384, 1024 }, new[] { 2 }, 0, true, new[] { 4 }, [[-1], [0], [-1]], 2 },
+        { ReduceOp.Max, new long[] { 1, 384, 1024 }, new[] { 2 }, 0, true, new[] { 4 }, [[-1], [0], [-1]], 3 },
+        { ReduceOp.Min, new long[] { 1, 384, 1024 }, new[] { 2 }, 0, true, new[] { 4 }, [[-1], [0], [-1]], 4 },
+        { ReduceOp.Sum, new long[] { 1, 384, 1024 }, new[] { 2 }, 0, true, new[] { 4 }, [[-1], [0], [-1]], 5 },
+        { ReduceOp.Mean, new long[] { 1, 3, 1024 }, new[] { 2 }, 0, true, new[] { 4 }, [[-1], [-1], [-1]], 6 },
+        { ReduceOp.Sum, new long[] { 1, 64, 384, 384 }, new[] { 3 }, 0, true, new[] { 64 }, [], 7 },
+    };
+
+    [Theory]
+    [ClassData(typeof(TestUpdatePagedAttentionCaseData))]
+    public async Task TestUpdatePagedAttentionCase(PagedAttentionKVCacheTestFixture fixture, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.Vectorize = false;
+
+        var placement = new Placement(hierarchy, targetOptions.HierarchyNames);
+        var dataGeneratorOptions = new PagedAttentionKVCacheTestFixture.DataGeneratorOptions(Random: true, IncreaseBy: [AttentionDimKind.Head], ResetForKV: true);
+        var referenceResults = PagedAttentionKVCacheTestFixture.PrepareReferenceResults(fixture.QueryLens, fixture.SeqLens, fixture.NumQHeads, fixture.Config.NumKVHeads, fixture.Config.HeadDim, fixture.Config.NumLayers, fixture.Config.KVPrimType, dataGeneratorOptions);
+
+        var (root, queryVar, kVVars, kVCacheObjVar) = Evaluator.NN.RefPagedAttentionKVCache.BuildPagedAttentionKernel(fixture.QueryLens, fixture.SeqLens, fixture.NumQHeads, fixture.NumBlocks, fixture.QLayout, fixture.KLayout, fixture.Config, new(true));
+
+        var kvinputs = PagedAttentionKVCacheTestFixture.PrepareKVInputs(fixture.QueryLens, fixture.SeqLens, fixture.ContextLens, fixture.NumBlocks, placement, referenceResults, fixture.Config);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        var rtFeedDict = new Dictionary<IVar, IValue>();
+        {
+            var queryTensor = referenceResults.GetQueryTensor();
+            feedDict.Add(queryVar, Value.FromTensor(queryTensor));
+            rtFeedDict.Add(queryVar, Value.FromTensor(queryTensor));
+            for (int layerId = 0; layerId < fixture.Config.NumLayers; layerId++)
+            {
+                feedDict.Add(kVVars[layerId][0], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 0)));
+                feedDict.Add(kVVars[layerId][1], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 1)));
+                rtFeedDict.Add(kVVars[layerId][0], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 0)));
+                rtFeedDict.Add(kVVars[layerId][1], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 1)));
+            }
+
+            feedDict.Add(kVCacheObjVar, Value.FromTensor(Tensor.FromScalar(new Reference<IPagedAttentionKVCache>(kvinputs.KVCacheObj))));
+
+            var kvCacheAddrs = new List<long>();
+            {
+                var logicalKVShape = kvinputs.KVCacheObj.KVCaches.Dimensions.ToArray();
+                foreach (var topoIndices in hierarchy.Select(i => Enumerable.Range(0, i)).CartesianProduct().Select(arr => arr.Select(i => (long)i).ToArray()))
+                {
+                    var indices = topoIndices.Concat(Enumerable.Repeat(0L, logicalKVShape.Length - hierarchy.Length)).ToArray();
+                    var shape = Enumerable.Repeat(1L, hierarchy.Length).Concat(logicalKVShape[hierarchy.Length..]).ToArray();
+                    var kvStorage = kvinputs.KVCacheObj.KVCaches.View(indices, shape);
+
+                    // FIXME: Memory leak here
+                    unsafe
+                    {
+                        kvCacheAddrs.Add((long)kvStorage.PinBuffer().Pointer);
+                    }
+                }
+            }
+
+            var rtkvObj = RTPagedAttentionKVCache.Create(
+                    kvinputs.KVCacheObj.NumSeqs,
+                    kvinputs.KVCacheObj.NumTokens,
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.ContextLens),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.SeqLens),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.BlockTables),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.SlotMapping),
+                    RTTensor.FromTensor(kvCacheAddrs.ToArray()));
+            rtFeedDict.Add(kVCacheObjVar, Value.FromTensor(Tensor.FromScalar(new Reference<IPagedAttentionKVCache>(rtkvObj))));
+        }
+
+        await RunCases($"Theory{count}", feedDict, new[] { root }, rtFeedDict);
+    }
+
+    [Theory]
+    [ClassData(typeof(TestPagedAttentionCaseData))]
+    public async Task TestPagedAttentionCase(PagedAttentionKVCacheTestFixture fixture, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.Vectorize = false;
+
+        var placement = new Placement(hierarchy, targetOptions.HierarchyNames);
+        var dataGeneratorOptions = new PagedAttentionKVCacheTestFixture.DataGeneratorOptions(Random: true, IncreaseBy: [AttentionDimKind.Head, AttentionDimKind.Seq], ResetForKV: true);
+        var referenceResults = PagedAttentionKVCacheTestFixture.PrepareReferenceResults(fixture.QueryLens, fixture.SeqLens, fixture.NumQHeads, fixture.Config.NumKVHeads, fixture.Config.HeadDim, fixture.Config.NumLayers, fixture.Config.KVPrimType, dataGeneratorOptions);
+
+        var (root, queryVar, kVVars, kVCacheObjVar) = Evaluator.NN.RefPagedAttentionKVCache.BuildPagedAttentionKernel(fixture.QueryLens, fixture.SeqLens, fixture.NumQHeads, fixture.NumBlocks, fixture.QLayout, fixture.KLayout, fixture.Config, new());
+
+        var kvinputs = PagedAttentionKVCacheTestFixture.PrepareKVInputs(fixture.QueryLens, fixture.SeqLens, fixture.ContextLens, fixture.NumBlocks, placement, referenceResults, fixture.Config);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        var rtFeedDict = new Dictionary<IVar, IValue>();
+        {
+            var queryTensor = referenceResults.GetQueryTensor();
+            feedDict.Add(queryVar, Value.FromTensor(queryTensor));
+            rtFeedDict.Add(queryVar, Value.FromTensor(queryTensor));
+            for (int layerId = 0; layerId < fixture.Config.NumLayers; layerId++)
+            {
+                feedDict.Add(kVVars[layerId][0], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 0)));
+                feedDict.Add(kVVars[layerId][1], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 1)));
+                rtFeedDict.Add(kVVars[layerId][0], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 0)));
+                rtFeedDict.Add(kVVars[layerId][1], Value.FromTensor(kvinputs.GetKeyValueTensor(layerId, 1)));
+            }
+
+            feedDict.Add(kVCacheObjVar, Value.FromTensor(Tensor.FromScalar(new Reference<IPagedAttentionKVCache>(kvinputs.KVCacheObj))));
+            var kvCacheAddrs = new List<long>();
+            {
+                var logicalKVShape = kvinputs.KVCacheObj.KVCaches.Dimensions.ToArray();
+                foreach (var topoIndices in hierarchy.Select(i => Enumerable.Range(0, i)).CartesianProduct().Select(arr => arr.Select(i => (long)i).ToArray()))
+                {
+                    var indices = topoIndices.Concat(Enumerable.Repeat(0L, logicalKVShape.Length - hierarchy.Length)).ToArray();
+                    var shape = Enumerable.Repeat(1L, hierarchy.Length).Concat(logicalKVShape[hierarchy.Length..]).ToArray();
+                    var kvStorage = kvinputs.KVCacheObj.KVCaches.View(indices, shape);
+
+                    // FIXME: Memory leak here
+                    unsafe
+                    {
+                        kvCacheAddrs.Add((long)kvStorage.PinBuffer().Pointer);
+                    }
+                }
+            }
+
+            var rtkvObj = RTPagedAttentionKVCache.Create(
+                    kvinputs.KVCacheObj.NumSeqs,
+                    kvinputs.KVCacheObj.NumTokens,
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.ContextLens),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.SeqLens),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.BlockTables),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.SlotMapping),
+                    RTTensor.FromTensor(kvCacheAddrs.ToArray()));
+            rtFeedDict.Add(kVCacheObjVar, Value.FromTensor(Tensor.FromScalar(new Reference<IPagedAttentionKVCache>(rtkvObj))));
+        }
+
+        await RunCases($"Theory{count}", feedDict, new[] { root }, rtFeedDict);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new[] { 32, 64 }, false, new[] { 64, 48 }, false, new[] { 48, 16 }, true, new[] { 1 }, 0 })]
+    [InlineData(new object[] { new[] { 128, 256 }, true, new[] { 256, 384 }, false, new[] { 384, 512 }, true, new[] { 2 }, 1 })]
+    [InlineData(new object[] { new[] { 1024, 2048 }, false, new[] { 2048, 1024 }, true, new[] { 1024, 3072 }, true, new[] { 4 }, 2, true })]
+    [InlineData(new object[] { new[] { 128, 256 }, true, new[] { 256, 384 }, false, new[] { 384, 512 }, true, new[] { 8 }, 3, false })]
+    public async Task TestTileFlowCase(int[] ashape, bool constA, int[] bshape, bool constB, int[] eshape, bool constE, int[] hierarchy, int count, bool vectorize = false)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.Vectorize = vectorize;
+        Expr a = constA ? Const.FromValue(IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, ashape).Evaluate()) : new Var("a", new TensorType(DataTypes.Float32, ashape));
+        Expr b = constB ? Const.FromValue(IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, bshape).Evaluate()) : new Var("b", new TensorType(DataTypes.Float32, bshape));
+        var c = IR.F.Tensors.MatMul(a, b);
+        var d = IR.F.Math.Neg(c);
+        Expr e = constE ? Const.FromValue(IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, eshape).Evaluate()) : new Var("e", new TensorType(DataTypes.Float32, eshape));
+        var f = IR.F.Tensors.MatMul(d, e);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        if (a is Var va)
+        {
+            feedDict.Add(va, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, ashape).Evaluate());
+        }
+
+        if (b is Var vb)
+        {
+            feedDict.Add(vb, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, bshape).Evaluate());
+        }
+
+        if (e is Var ve)
+        {
+            feedDict.Add(ve, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, eshape).Evaluate());
+        }
+
+        await RunCases($"Theory{count}", feedDict, new[] { f });
+    }
+
+    [Theory]
+    [MemberData(nameof(TestReshardData))]
+    public async Task TestReshard(long[] shape, int[] hierarchy, List<int[][]> sbps, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var inputType = new TensorType(DataTypes.Float32, shape);
+        var input = new Var(inputType);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 1.0f, 1.0f, 1, shape).Evaluate() },
+        };
+
+        var placement = new Placement(hierarchy, targetOptions.HierarchyNames);
+        var ndsbps = sbps.Select(sbp => sbp.Select(s => s[0] < 0 ? (SBP)SBP.B : SBP.S(s)).ToArray()).ToArray();
+        Expr boxed = input;
+        foreach (var ndsbp in ndsbps)
+        {
+            boxed = IR.F.Distributed.Boxing(boxed, new DistributedType(inputType, ndsbp, placement));
+        }
+
+        var post = IR.F.Distributed.Boxing(boxed, inputType);
+        post.Metadata = new Passes.Distributed.AutoDistributedMetaData() { Skip = true };
+        await RunCases($"Theory{count}", feedDict, new[] { post }, enableAutoDist: false);
+    }
+
+    [Theory]
+    [InlineData([new long[] { 32, 64 }, new int[] { 2 }, 0])]
+    [InlineData([new long[] { 8, 4 }, new int[] { 4, 2 }, 1])]
+    [InlineData([new long[] { 32, 64, 128 }, new int[] { 8, 4, 2 }, 2])]
+    [InlineData([new long[] { 64, 128 }, new int[] { 2, 4, 8 }, 3])]
+    public async Task TestGatherReduceScatter(long[] shape, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var inputType = new TensorType(DataTypes.Float32, shape);
+        var input = new Var(inputType);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            // { input, IR.F.Tensors.ConstantOfShape(shape, 1.0f).Evaluate() },
+            { input, IR.F.Random.Normal(DataTypes.Float32, 1.0f, 1.0f, 1, shape).Evaluate() },
+        };
+
+        var placement = new Placement(hierarchy, targetOptions.HierarchyNames);
+        var ndsbp = Enumerable.Repeat<SBP>(SBP.B, hierarchy.Length).ToArray();
+        var posts = new List<Call>();
+        var broadcast = IR.F.Distributed.Boxing(input, new DistributedType(inputType, ndsbp, placement));
+        foreach (var comb in LinqUtility.Combination(hierarchy.Length))
+        {
+            var newsbp = ndsbp.ToArray();
+            foreach (var axis in comb)
+            {
+                newsbp[axis] = SBP.P();
+            }
+
+            var partial = IR.F.Distributed.ForceBoxing(broadcast, new DistributedType(inputType, newsbp, placement));
+            var sumed = IR.F.Distributed.Boxing(partial, new DistributedType(inputType, ndsbp, placement));
+            var post = IR.F.Distributed.Boxing(sumed, inputType);
+            post.Metadata = new Passes.Distributed.AutoDistributedMetaData() { Skip = true };
+            posts.Add(post);
+        }
+
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Fact]
+    public async Task TestMatmulBinaryBinary()
+    {
+        var ashape = new[] { 1, 64, 384, 128 };
+        var bshape = new[] { 1, 64, 128, 384 };
+        var a = new Var("a", new TensorType(DataTypes.Float32, ashape));
+        var b = new Var("b", new TensorType(DataTypes.Float32, bshape));
+        var c = IR.F.Tensors.MatMul(a, b);
+        var dshape = new[] { 1 };
+        var d = new Var("d", new TensorType(DataTypes.Float32, dshape));
+        var e = IR.F.Math.Binary(BinaryOp.Div, c, d);
+        var fshape = new[] { 1, 1, 384, 384 };
+        var f = new Var("f", new TensorType(DataTypes.Float32, fshape));
+        var g = IR.F.Math.Binary(BinaryOp.Add, e, f);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { a, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, ashape).Evaluate() },
+            { b, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, bshape).Evaluate() },
+            { d, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, dshape).Evaluate() },
+            { f, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, fshape).Evaluate() },
+        };
+
+        await RunCases(string.Empty, feedDict, new[] { g });
+    }
+
+    [Fact]
+    public async Task TestDynamicMatmulBinaryBinary()
+    {
+        var dimM = new DimVar("m");
+        dimM.Metadata.Range = new(1, 384 * 2);
+        var ashape = new long[] { 1, 64, 384, 128 };
+        var bshape = new long[] { 1, 64, 128, 384 };
+        var aDims = ashape.Select(x => (Dimension)x).ToArray();
+        aDims[^2] = dimM;
+
+        var a = new Var("a", new TensorType(DataTypes.Float32, new RankedShape(aDims)));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(a, aDims.Select(x => x).ToArray());
+        var b = new Var("b", new TensorType(DataTypes.Float32, bshape));
+        var c = IR.F.Tensors.MatMul(a, b);
+        var dshape = new[] { 1 };
+        var d = new Var("d", new TensorType(DataTypes.Float32, dshape));
+        var e = IR.F.Math.Binary(BinaryOp.Div, c, d);
+        var fshape = new[] { 1, 1, 384, 384 };
+        var f = new Var("f", new TensorType(DataTypes.Float32, fshape));
+        var g = IR.F.Math.Binary(BinaryOp.Add, e, f);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { a, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, ashape).Evaluate() },
+            { b, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, bshape).Evaluate() },
+            { d, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, dshape).Evaluate() },
+            { f, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, fshape).Evaluate() },
+            { dimM, Value.FromTensor(ashape[^2]) },
+        };
+
+        await RunCases(string.Empty, feedDict, new[] { g });
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 32, 512, 64, 64 }, 0 })]
+    public async Task TestSwish(long[] shape, int count)
+    {
+        var input = new Var(new TensorType(DataTypes.Float32, shape));
+        var pre = IR.F.NN.Swish(input);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeSwish(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 32, 133, 64, 64 }, new[] { 1 }, new[] { 4 }, 0 })]
+    [InlineData(new object[] { new long[] { 32, 12, 34, 49 }, new[] { 2, 3 }, new[] { 4 }, 1 })]
+    public async Task TestDynamicSwish(long[] shape, int[] dynamicAxes, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dynShape = new RankedShape(Enumerable.Range(0, shape.Length).Select(i => dynamicAxes.Contains(i) ? new DimVar($"dim{i}")
+        {
+            Metadata = new() { Range = new(1, Dimension.AlignUp(shape[i] * 2, 64).FixedValue) },
+        } : (Dimension)shape[i]).ToArray());
+        var input = new Var(new TensorType(DataTypes.Float32, dynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(input, dynShape.ToArray());
+        var pre = IR.F.NN.Swish(input);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, Value.FromTensor(Tensor.FromScalar(1f, shape)) /* IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() */ },
+        };
+        foreach (var axis in dynamicAxes)
+        {
+            feedDict.Add((DimVar)dynShape[axis], Value.FromTensor(shape[axis]));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeSwish(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 4, 8, 16, 32 }, new[] { 1 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 64, 384, 128 }, new[] { 4 }, 1 })]
+    public async Task TestUnary(long[] shape, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var input = new Var(new TensorType(DataTypes.Float32, shape));
+        var pre = IR.F.Math.Unary(UnaryOp.Neg, input);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeUnary(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 4, 8, 16, 32 }, new[] { 1 }, new int[] { 0 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 64, 384, 128 }, new[] { 4 }, new int[] { 1 }, 1 })]
+    [InlineData(new object[] { new long[] { 4, 64, 128, 256 }, new[] { 4 }, new int[] { 2 }, 2 })]
+    [InlineData(new object[] { new long[] { 4, 64, 256, 128 }, new[] { 4 }, new int[] { 3 }, 3 })]
+    public async Task TestDynamicUnary(long[] shape, int[] hierarchy, int[] dynamicAxes, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 40), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVars = new[] { "n", "c", "h", "w" }.Select((x, i) =>
+        {
+            var v = new DimVar(x);
+            v.Metadata.Range = new(1, shape[i] * 2);
+            return v;
+        }).ToArray();
+        var input = new Var(new TensorType(DataTypes.Float32, new RankedShape(Enumerable.Range(0, shape.Length).Select(i => dynamicAxes.Contains(i) ? dimVars[i] : (Dimension)shape[i]).ToArray())));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(input, dimVars);
+
+        var pre = IR.F.Math.Unary(UnaryOp.Neg, input);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, Value.FromTensor(Tensor.FromScalar<float>(1f, shape)) },
+            { dimVars[0], Value.FromTensor(shape[0]) },
+            { dimVars[1], Value.FromTensor(shape[1]) },
+            { dimVars[2], Value.FromTensor(shape[2]) },
+            { dimVars[3], Value.FromTensor(shape[3]) },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeUnary(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 101, 256 }, 1, new[] { 8 }, 0, 0 })]
+    [InlineData(new object[] { new long[] { 13, 64, 256 }, 2, new[] { 2, 4 }, 0, 1 })]
+    public async Task TestDynamicLayerNorm(long[] shape, int axis, int[] hierarchy, int dynamicAxis, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 40), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVar = new DimVar("seq_len")
+        {
+            Metadata = new() { Range = new(1, 128) },
+        };
+        var inputShape = new RankedShape(Enumerable.Range(0, shape.Length).Select(i => dynamicAxis == i ? dimVar : (Dimension)shape[i]).ToArray());
+        var input = new Var(new TensorType(DataTypes.Float32, inputShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(input, inputShape.ToArray());
+
+        var pre = IR.F.NN.LayerNorm(axis, 1e-6f, input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, new RankedShape(inputShape[axis])).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, new RankedShape(inputShape[axis])).Evaluate().AsTensor(), false);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+            { dimVar, Value.FromTensor(shape[dynamicAxis]) },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeLayerNorm(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [MemberData(nameof(TestVectorizeBinaryData))]
+    public async Task TestVectorizeBinary(BinaryOp op, long[] lhsShape, long[] rhsShape, int[] hierarchy, int[][] sbps, PostOpKind[] postOpKinds, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsShape));
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Math.Binary(op, lhs, rhs);
+        var rule = new Passes.Rules.NTT.VectorizeBinary(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext())).Select(post =>
+        {
+            if (post is not Call { Target: IR.Tensors.Unpack unpack } call)
+            {
+                return post;
+            }
+
+            var newPost = (Expr)call.Arguments[0];
+            if (postOpKinds.Length > 0)
+            {
+                for (int i = 0; i < postOpKinds.Length; i++)
+                {
+                    newPost = postOpKinds[i] switch
+                    {
+                        PostOpKind.MulScalar => IR.F.Math.Binary(BinaryOp.Mul, newPost, 1.32f),
+                        PostOpKind.ScalarDiv => IR.F.Math.Binary(BinaryOp.Div, 0.32f, newPost),
+                        _ => throw new NotSupportedException($"Unsupported post operation kind: {postOpKinds[i]}"),
+                    };
+                }
+            }
+
+            return IR.F.Tensors.Unpack(newPost, unpack.Lanes.ToArray(), unpack.Axes.ToArray());
+        });
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate() },
+        };
+
+        if (sbps.Length > 0)
+        {
+            foreach (var post in posts)
+            {
+                var call = ExprCollector.Collect(post).Where(e => e is Call { Target: IR.NTT.VectorizedBinary or IR.Math.Binary }).First();
+                call.Metadata = new() { OutputNames = new[] { "call" } };
+            }
+
+            var scheme = new Passes.Distributed.DistributedSchema("1", "llama", [new("call", sbps.Select(s => s[0] < 0 ? SBP.B : (SBP)SBP.S(s)).ToArray(), hierarchy, targetOptions.HierarchyNames)]);
+            var options = new JsonSerializerOptions();
+            options.Converters.Add(new SBPConverter());
+            options.WriteIndented = true;
+            var export = System.Text.Json.JsonSerializer.Serialize(scheme, options);
+            var dumpper = Diagnostics.DumpScope.Current.CreateSubDummper($"Theory{count}");
+            targetOptions.DistributedScheme = Path.Join(dumpper.Directory, "schema.json");
+            using (var stream = dumpper.OpenFile("schema.json"))
+            {
+                using (var writer = new StreamWriter(stream))
+                {
+                    writer.Write(export);
+                }
+            }
+        }
+
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { BinaryOp.Max, new long[] { 56, 1 }, new long[] { 56, 1 }, new int[] { 1 }, new int[] { }, new int[] { 0, 2 }, 0 })] // note max(f32[sequence_length,1],f32[sequence_length,1])
+    [InlineData(new object[] { BinaryOp.Div, new long[] { 1 }, new long[] { 36, 1 }, new int[] { 4 }, new int[] { }, new int[] { 1 }, 1 })] // note div(f32[1], f32[sequence_length,1])
+    [InlineData(new object[] { BinaryOp.Mul, new long[] { 112, 32 }, new long[] { 112, 1 }, new int[] { 2 }, new int[] { }, new int[] { 0, 2 }, 2 })] // note mul(f32[sequence_length,32], f32[sequence_length,1])
+    [InlineData(new object[] { BinaryOp.Mul, new long[] { 66, 64 }, new long[] { 66, 1 }, new int[] { 8 }, new int[] { }, new int[] { 0, 2 }, 3 })] // note mul(f32[sequence_length,64], f32[sequence_length,1])
+    [InlineData(new object[] { BinaryOp.Mul, new long[] { 15, 64 }, new long[] { 1, 64 }, new int[] { 4 }, new int[] { }, new int[] { 0 }, 4 })] // note mul(f32[sequence_length,64], const(f32[1,64]))
+    [InlineData(new object[] { BinaryOp.Mul, new long[] { 16, 101, 4 }, new long[] { 1, 101, 4 }, new int[] { 4 }, new int[] { }, new int[] { 1, 4 }, 5 })] // note mul(f32[16,sequence_length,4], f32[1,sequence_length,4])
+    [InlineData(new object[] { BinaryOp.Add, new long[] { 1 }, new long[] { 32, 28 }, new int[] { 4 }, new int[] { }, new int[] { 2 }, 6 })] // note div(f32[1], f32[32, sequence_length])
+    public async Task TestDynamicVectorizeBinary(BinaryOp op, long[] lhsShape, long[] rhsShape, int[] hierarchy, int[] sbps, int[] dynamicAxes, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVar = new DimVar("seq_length")
+        {
+            Metadata = new()
+            {
+                Range = new(1, 128),
+            },
+        };
+
+        var lhsDynShape = new RankedShape(Enumerable.Range(0, lhsShape.Length).Select(i => dynamicAxes.Contains(i) ? dimVar : (Dimension)lhsShape[i]).ToArray());
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(lhs, lhsDynShape.ToArray());
+        var rhsDynShape = new RankedShape(Enumerable.Range(0, rhsShape.Length).Select(i => dynamicAxes.Contains(lhsDynShape.Rank + i) ? dimVar : (Dimension)rhsShape[i]).ToArray());
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(rhs, rhsDynShape.ToArray());
+        var pre = IR.F.Math.Binary(op, lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate() },
+            { dimVar, Value.FromTensor(lhsShape.Concat(rhsShape).Skip(dynamicAxes[0]).First()) },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeBinary(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 113 }, new[] { 0 }, new[] { 2 }, new[] { 0 }, new int[] { }, new int[] { }, 0 })] // note pack(Lanes: {32}, Axes: {1}, [seq_len, 1024])
+    [InlineData(new object[] { new long[] { 68, 128 }, new[] { 0, 1 }, new[] { 4 }, new[] { 0 }, new[] { 0 }, new[] { 64 }, 1 })] // note pack(Lanes: {64, 128}, Axes: {0, 1}, [seq_len + padding, 1024])
+    [InlineData(new object[] { new long[] { 64, 103 }, new[] { 1 }, new[] { 4 }, new[] { 1 }, new int[] { }, new int[] { }, 2 })] // note pack(Lanes: {32}, Axes: {0}, [64, sequence_length])
+    [InlineData(new object[] { new long[] { 1, 99, 128 }, new[] { 1 }, new[] { 4 }, new[] { 1 }, new int[] { }, new int[] { }, 3 })] // note pack(Lanes: {32}, Axes: {2}, [1, sequence_length, 128])
+    public async Task TestDynamicVectorizeDevectorize(long[] shape, int[] axes, int[] hierarchy, int[] dynamicAxes, int[] alignAxes, int[] alignValues, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVar = new DimVar("seq_len")
+        {
+            Metadata = new()
+            {
+                Range = new(1, 128),
+            },
+        };
+
+        var dynShape = new RankedShape(Enumerable.Range(0, shape.Length).Select(i =>
+        {
+            if (dynamicAxes.Contains(i))
+            {
+                return dimVar;
+            }
+
+            return (Dimension)shape[i];
+        }).ToArray());
+        var input = new Var(new TensorType(DataTypes.Float32, dynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(input, dynShape.ToArray());
+
+        var lanes = axes.Select(i => 32).ToArray();
+        for (int i = 0; i < alignAxes.Length; i++)
+        {
+            lanes[alignAxes[i]] = alignValues[i];
+        }
+
+        var paded = VectorizeUtility.PadForVectorize(input, dynShape, axes, lanes, 0f, out var padNums);
+        var vectorized = IR.F.Tensors.Pack(paded, lanes, axes);
+        var devectorized = IR.F.Tensors.Unpack(vectorized, lanes, axes);
+        var sliced = VectorizeUtility.SliceForVectorize(devectorized, dynShape, padNums);
+
+        // note 2d vectorize will cause the devectorize issue.
+        // var inputTensor = Tensor.FromScalar<float>(0, shape);
+        // for (int i = 0; i < shape[0]; i++)
+        // {
+        //     for (int j = 0; j < shape[1]; j++)
+        //     {
+        //         inputTensor[i, j] = i;
+        //     }
+        // }
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, /* Value.FromTensor(inputTensor) */ IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+            { dimVar, Value.FromTensor(shape[dynamicAxes[0]]) },
+        };
+
+        await RunCases($"Theory{count}", feedDict, new[] { sliced });
+    }
+
+    [Theory(Skip = "Drop InstanceNorm")]
+    [InlineData(new object[] { new long[] { 1, 2, 16, 32 }, 1e-5, 0 })]
+    [InlineData(new object[] { new long[] { 1, 32, 2048 }, 1e-6, 1 })]
+    public async Task TestInstanceNorm(long[] shape, float epsion, int count)
+    {
+        var input = new Var(new TensorType(DataTypes.Float32, shape));
+        var pshape = new[] { shape[1] };
+        var scale = new Var(new TensorType(DataTypes.Float32, pshape));
+        var bias = new Var(new TensorType(DataTypes.Float32, pshape));
+        var pre = IR.F.NN.InstanceNormalization(input, scale, bias, epsion);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+            { scale, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, pshape).Evaluate() },
+            { bias, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, pshape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeInstanceNorm(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 4, 32, 32 }, ImageResizeMode.Bilinear, new long[] { 1, 4, 64, 64 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 8, 32, 32 }, ImageResizeMode.NearestNeighbor, new long[] { 1, 8, 64, 64 }, 1 })]
+    public async Task TestResizeImage(long[] shape, ImageResizeMode resizeMode, long[] newSize, int count)
+    {
+        var input = new Var(new TensorType(DataTypes.Float32, shape));
+        var pre = IR.F.Imaging.ResizeImage(resizeMode, input, Array.Empty<float>(), newSize);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeResizeImage(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 256, 64, 64 }, Runtime.TypeCode.Float8E4M3, Runtime.TypeCode.Float32, new PostOpKind[] { }, new int[] { }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 64, 64, 256 }, Runtime.TypeCode.BFloat16, Runtime.TypeCode.Float8E4M3, new PostOpKind[] { PostOpKind.MulScalar }, new int[] { }, 1 })]
+    [InlineData(new object[] { new long[] { 1, 64, 256, 64 }, Runtime.TypeCode.BFloat16, Runtime.TypeCode.Float16, new PostOpKind[] { }, new int[] { }, 2 })]
+    [InlineData(new object[] { new long[] { 64 }, Runtime.TypeCode.Float8E4M3, Runtime.TypeCode.Float32, new PostOpKind[] { PostOpKind.MulScalar }, new int[] { }, 3 })]
+    [InlineData(new object[] { new long[] { 43 /* seq_len */, 16, 256 }, Runtime.TypeCode.Float32, Runtime.TypeCode.BFloat16, new PostOpKind[] { PostOpKind.MulScalar }, new int[] { 0 }, 4 })]
+    [InlineData(new object[] { new long[] { 29 /* seq_len */, 64 }, Runtime.TypeCode.BFloat16, Runtime.TypeCode.Float32, new PostOpKind[] { PostOpKind.MulScalar }, new int[] { 0 }, 5 })]
+    public async Task TestVectorizeCast(long[] shape, Runtime.TypeCode type1, Runtime.TypeCode type2, PostOpKind[] postOpKinds, int[] dynamicAxes, int count)
+    {
+        Expr postOps = None.Default;
+
+        var dynShape = new RankedShape(Enumerable.Range(0, shape.Length).Select(i => dynamicAxes.Contains(i) ? new DimVar($"dim{i}")
+        {
+            Metadata = new() { Range = new(1, Dimension.AlignUp(shape[i] * 2, 64).FixedValue) },
+        } : (Dimension)shape[i]).ToArray());
+        var input = new Var(new TensorType(DataTypes.Float32, dynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(input, dynShape.ToArray());
+        var casted1 = IR.F.Tensors.Cast(input, DataType.FromTypeCode(type1));
+        var casted2 = IR.F.Tensors.Cast(casted1, DataType.FromTypeCode(type2));
+        var rule = new Passes.Rules.NTT.VectorizeCast(1, Lane);
+        CompilerServices.TryMatchRoot(casted2, rule.Pattern, out var result);
+        var posts = new[] { casted2 }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext())).Select(post =>
+        {
+            if (post is not Call { Target: IR.Tensors.Unpack unpack } call)
+            {
+                return IR.F.Tensors.Cast(post, DataTypes.Float32);
+            }
+
+            var newPost = (Expr)call.Arguments[0];
+            if (postOpKinds.Length > 0)
+            {
+                for (int i = 0; i < postOpKinds.Length; i++)
+                {
+                    newPost = postOpKinds[i] switch
+                    {
+                        PostOpKind.MulScalar => IR.F.Math.Binary(BinaryOp.Mul, newPost, Tensor.FromScalar(1.32f).CastElementTo(DataType.FromTypeCode(type2))),
+                        PostOpKind.ScalarDiv => IR.F.Math.Binary(BinaryOp.Div, Tensor.FromScalar(0.32f).CastElementTo(DataType.FromTypeCode(type2)), newPost),
+                        _ => throw new NotSupportedException($"Unsupported post operation kind: {postOpKinds[i]}"),
+                    };
+                }
+            }
+
+            return IR.F.Tensors.Cast(IR.F.Tensors.Unpack(newPost, unpack.Lanes.ToArray(), unpack.Axes.ToArray()), DataTypes.Float32);
+        });
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        foreach (var axis in dynamicAxes)
+        {
+            feedDict.Add((DimVar)dynShape[axis], Value.FromTensor(shape[axis]));
+        }
+
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 384, 512 }, new long[] { 512, 512 }, false, false, new[] { 1 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 1, 384, 256 }, new long[] { 32, 256, 512 }, false, false, new[] { 1 }, 1 })]
+    [InlineData(new object[] { new long[] { 384, 512 }, new long[] { 512, 512 }, false, false, new[] { 1 }, 2 })]
+    [InlineData(new object[] { new long[] { 1, 384, 512 }, new long[] { 512, 512 }, false, true, new[] { 1 }, 3 })]
+    [InlineData(new object[] { new long[] { 1, 1, 384, 256 }, new long[] { 32, 256, 512 }, false, true, new[] { 1 }, 4 })]
+    [InlineData(new object[] { new long[] { 384, 512 }, new long[] { 512, 512 }, false, true, new[] { 1 }, 5 })]
+    [InlineData(new object[] { new long[] { 384, 512 }, new long[] { 512, 256 }, false, true, new[] { 2 }, 6 })]
+    public async Task TestVectorizeMatMul(long[] lhsShape, long[] rhsShape, bool constA, bool constB, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".Skip(4 - hierarchy.Length));
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(lhsShape, 1.0f).Evaluate().AsTensor();
+        var rhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(rhsShape, 1.0f).Evaluate().AsTensor();
+
+        // var lhsTensor = Tensor.From(Enumerable.Range(0, (int)TensorUtilities.GetProduct(lhsShape)).Select(i => (float)i).ToArray(), lhsShape);
+        // var rhsTensor = Tensor.From(Enumerable.Range(0, (int)TensorUtilities.GetProduct(rhsShape)).Select(i => (float)i).ToArray(), rhsShape);
+        Expr lhs = constA ? lhsTensor : new Var(new TensorType(DataTypes.Float32, lhsShape));
+        Expr rhs = constB ? rhsTensor : new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Tensors.MatMul(lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        if (!constA)
+        {
+            feedDict.Add((Var)lhs, Value.FromTensor(lhsTensor));
+        }
+
+        if (!constB)
+        {
+            feedDict.Add((Var)rhs, Value.FromTensor(rhsTensor));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane, transB: true);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 3, 2 }, new long[] { 2, 64 }, false, false, new[] { 1 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 3, 2 }, new long[] { 2, 64 }, false, true, new[] { 1 }, 1 })]
+    [InlineData(new object[] { new long[] { 1, 2 }, new long[] { 2, 64 }, false, true, new[] { 1 }, 2 })]
+    public async Task TestPackedMatMul(long[] lhsShape, long[] rhsShape, bool constA, bool constB, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".Skip(4 - hierarchy.Length));
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(lhsShape, 1.0f).Evaluate().AsTensor();
+        var rhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(rhsShape, 1.0f).Evaluate().AsTensor();
+
+        // var lhsTensor = Tensor.From(Enumerable.Range(0, (int)TensorUtilities.GetProduct(lhsShape)).Select(i => (float)i).ToArray(), lhsShape);
+        // var rhsTensor = Tensor.From(Enumerable.Range(0, (int)TensorUtilities.GetProduct(rhsShape)).Select(i => (float)i).ToArray(), rhsShape);
+        Expr lhs = constA ? lhsTensor : new Var(new TensorType(DataTypes.Float32, lhsShape));
+        Expr rhs = constB ? rhsTensor : new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Tensors.MatMul(lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        if (!constA)
+        {
+            feedDict.Add((Var)lhs, Value.FromTensor(lhsTensor));
+        }
+
+        if (!constB)
+        {
+            feedDict.Add((Var)rhs, Value.FromTensor(rhsTensor));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane, transB: true);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var vectorizedPosts = rule.GetReplaceCandidates(result!, new Passes.RunPassContext());
+
+        var packRule = new Passes.Rules.NTT.PackMatMulByN(4);
+        var posts = new List<Expr>();
+        foreach (var post in vectorizedPosts)
+        {
+            var context = new Passes.RunPassContext();
+            var newPost = CompilerServices.Rewrite(post, [packRule], context);
+            if (context.IsMutated)
+            {
+                posts.Add((Expr)newPost);
+            }
+        }
+
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { CompareOp.Equal, new long[] { 34 }, new long[] { 1 }, new[] { 4 }, new int[] { }, new int[] { 0 }, 0 })]
+    public async Task TestDynamicVectorizeCompare(CompareOp op, long[] lhsShape, long[] rhsShape, int[] hierarchy, int[] sbps, int[] dynamicAxes, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVar = new DimVar("seq_length")
+        {
+            Metadata = new()
+            {
+                Range = new(1, 256),
+            },
+        };
+
+        var lhsDynShape = new RankedShape(Enumerable.Range(0, lhsShape.Length).Select(i => dynamicAxes.Contains(i) ? dimVar : (Dimension)lhsShape[i]).ToArray());
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(lhs, lhsDynShape.ToArray());
+        var rhsDynShape = new RankedShape(Enumerable.Range(0, rhsShape.Length).Select(i => dynamicAxes.Contains(lhsDynShape.Rank + i) ? dimVar : (Dimension)rhsShape[i]).ToArray());
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(rhs, rhsDynShape.ToArray());
+        var pre = IR.F.Math.Compare(op, lhs, rhs);
+
+        var lhsElems = Enumerable.Range(0, (int)TensorUtilities.GetProduct(lhsShape)).Select(i => (float)i).ToArray();
+        var rhsElems = Enumerable.Range(0, (int)TensorUtilities.GetProduct(rhsShape)).Select(i => (float)i).ToArray();
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { lhs, Value.FromTensor(Tensor.From(lhsElems, lhsShape)) },
+            { rhs, Value.FromTensor(Tensor.From(rhsElems, rhsShape)) },
+            { dimVar, Value.FromTensor(lhsShape.Concat(rhsShape).Skip(dynamicAxes[0]).First()) },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeCompare(MaskVectorStyle.Slim, Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 56 /* seq_length */, 1 }, new long[] { 1 }, new long[] { 56, 16 }, new int[] { 4 }, new int[] { }, new int[] { 0, 3 }, 0 })]
+    public async Task TestDynamicVectorizeWhere(long[] condShape, long[] lhsShape, long[] rhsShape, int[] hierarchy, int[] sbps, int[] dynamicAxes, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVar = new DimVar("seq_length")
+        {
+            Metadata = new()
+            {
+                Range = new(1, 128),
+            },
+        };
+
+        var condDynShape = new RankedShape(Enumerable.Range(0, condShape.Length).Select(i => dynamicAxes.Contains(i) ? dimVar : (Dimension)condShape[i]).ToArray());
+        var cond = new Var(new TensorType(DataTypes.Boolean, condDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(cond, condDynShape.ToArray());
+
+        var lhsDynShape = new RankedShape(Enumerable.Range(0, lhsShape.Length).Select(i => dynamicAxes.Contains(condShape.Length + i) ? dimVar : (Dimension)lhsShape[i]).ToArray());
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(lhs, lhsDynShape.ToArray());
+
+        var rhsDynShape = new RankedShape(Enumerable.Range(0, rhsShape.Length).Select(i => dynamicAxes.Contains(condShape.Length + lhsShape.Length + i) ? dimVar : (Dimension)rhsShape[i]).ToArray());
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsDynShape));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(rhs, rhsDynShape.ToArray());
+
+        var pre = IR.F.Tensors.Where(cond, lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { cond, IR.F.Random.Normal(DataTypes.Boolean, 0, 1, 1, condShape).Evaluate() },
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate() },
+            { dimVar, Value.FromTensor(condShape.Concat(lhsShape).Concat(rhsShape).Skip(dynamicAxes[0]).First()) },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeWhere(MaskVectorStyle.Slim, Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 83 }, new long[] { 83 }, new[] { 4 }, 0 })]
+    [InlineData(new object[] { new long[] { 1 }, new long[] { 84 }, new[] { 4 }, 1 })]
+    [InlineData(new object[] { new long[] { 12, 1 }, new long[] { 12, 84 }, new[] { 4 }, 2 })]
+    [InlineData(new object[] { new long[] { 4, 4, 4, 4 }, new long[] { 4, 12, 20, 28 }, new[] { 4 }, 3 })]
+    [InlineData(new object[] { new long[] { 25, 28, 31, 19, 25 }, new long[] { 25 + 0, 28 + 1, 31 + 16, 19 + 4, 25 + 7 }, new[] { 4 }, 3 })]
+    public async Task TestDynamicGetPositionIds(long[] queryLens, long[] seqLens, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 40), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimVar = new DimVar("seq_length")
+        {
+            Metadata = new() { Range = new(1, MathUtility.AlignUp(queryLens.Sum(), 128)) },
+        };
+
+        var fixture = new PagedAttentionKVCacheTestFixture(queryLens, seqLens, 2, 2, 64, 64, (int)MathUtility.CeilDiv(seqLens.Select(seq_len => MathUtility.CeilDiv(seq_len, 64)).Sum(), hierarchy.Max()) * hierarchy.Max(), Runtime.TypeCode.Float32, 1, [PagedKVCacheDimKind.NumBlocks, PagedKVCacheDimKind.NumLayers, PagedKVCacheDimKind.KV, PagedKVCacheDimKind.NumKVHeads, PagedKVCacheDimKind.HeadDim, PagedKVCacheDimKind.BlockSize], [PagedKVCacheDimKind.HeadDim], [PagedKVCacheDimKind.NumBlocks], [SBP.S(0)], [AttentionDimKind.Seq, AttentionDimKind.Dim, AttentionDimKind.Head], [AttentionDimKind.Seq, AttentionDimKind.Dim, AttentionDimKind.Head]);
+
+        var placement = new Placement(hierarchy, targetOptions.HierarchyNames);
+        var dataGeneratorOptions = new PagedAttentionKVCacheTestFixture.DataGeneratorOptions(Random: true, IncreaseBy: [AttentionDimKind.Head], ResetForKV: true);
+        var referenceResults = PagedAttentionKVCacheTestFixture.PrepareReferenceResults(fixture.QueryLens, fixture.SeqLens, fixture.NumQHeads, fixture.Config.NumKVHeads, fixture.Config.HeadDim, fixture.Config.NumLayers, fixture.Config.KVPrimType, dataGeneratorOptions);
+
+        var (_, _, _, kVCacheObjVar) = Evaluator.NN.RefPagedAttentionKVCache.BuildPagedAttentionKernel(fixture.QueryLens, fixture.SeqLens, fixture.NumQHeads, fixture.NumBlocks, fixture.QLayout, fixture.KLayout, fixture.Config, new(true));
+
+        var kvinputs = PagedAttentionKVCacheTestFixture.PrepareKVInputs(fixture.QueryLens, fixture.SeqLens, fixture.ContextLens, fixture.NumBlocks, placement, referenceResults, fixture.Config);
+
+        var pre = IR.F.NN.GetPositionIds(dimVar, kVCacheObjVar);
+
+        var feedDict = new Dictionary<IVar, IValue>
+        {
+            { kVCacheObjVar, Value.FromTensor(Tensor.FromScalar(new Reference<IPagedAttentionKVCache>(kvinputs.KVCacheObj))) },
+            { dimVar, Value.FromTensor(queryLens.Sum()) },
+        };
+
+        var rtFeedDict = new Dictionary<IVar, IValue>();
+        var kvCacheAddrs = new List<long>();
+        {
+            var logicalKVShape = kvinputs.KVCacheObj.KVCaches.Dimensions.ToArray();
+            foreach (var topoIndices in hierarchy.Select(i => Enumerable.Range(0, i)).CartesianProduct().Select(arr => arr.Select(i => (long)i).ToArray()))
+            {
+                var indices = topoIndices.Concat(Enumerable.Repeat(0L, logicalKVShape.Length - hierarchy.Length)).ToArray();
+                var shape = Enumerable.Repeat(1L, hierarchy.Length).Concat(logicalKVShape[hierarchy.Length..]).ToArray();
+                var kvStorage = kvinputs.KVCacheObj.KVCaches.View(indices, shape);
+
+                // FIXME: Memory leak here
+                unsafe
+                {
+                    kvCacheAddrs.Add((long)kvStorage.PinBuffer().Pointer);
+                }
+            }
+        }
+
+        var rtkvObj = RTPagedAttentionKVCache.Create(
+                    kvinputs.KVCacheObj.NumSeqs,
+                    kvinputs.KVCacheObj.NumTokens,
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.ContextLens),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.SeqLens),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.BlockTables),
+                    RTTensor.FromTensor(kvinputs.KVCacheObj.SlotMapping),
+                    RTTensor.FromTensor(kvCacheAddrs.ToArray()));
+        rtFeedDict.Add(kVCacheObjVar, Value.FromTensor(Tensor.FromScalar(new Reference<IPagedAttentionKVCache>(rtkvObj))));
+        rtFeedDict.Add(dimVar, Value.FromTensor(queryLens.Sum()));
+
+        await RunCases($"Theory{count}", feedDict, new[] { pre }, rtFeedDict);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 7, 1024 }, new long[] { 1024, 64 }, false, true, new[] { 1 }, new[] { 0 }, Runtime.TypeCode.Float32, null!, Runtime.TypeCode.Float32, 0 })] // note const(f32[sequence_length,2048]) @ [2048,4096]
+    [InlineData(new object[] { new long[] { 64, 1 }, new long[] { 1, 94 }, true, false, new[] { 4 }, new[] { 3 }, Runtime.TypeCode.Float32, null!, Runtime.TypeCode.Float32, 1 })] // note const(f32[64,1]) @ [1,sequence_length]
+    [InlineData(new object[] { new long[] { 7, 1024 }, new long[] { 1024, 64 }, false, true, new[] { 1 }, new[] { 0 }, Runtime.TypeCode.Float8E4M3, Runtime.TypeCode.Float32, Runtime.TypeCode.Float16, 2 })] // note const(f32[sequence_length,2048]) @ [2048,4096], f8 in, f16 out
+    [InlineData(new object[] { new long[] { 7, 512 }, new long[] { 512, 128 }, false, true, new[] { 1 }, new[] { 0 }, Runtime.TypeCode.Float8E4M3, Runtime.TypeCode.Float32, Runtime.TypeCode.Float32, 3 })] // note const(f32[sequence_length,2048]) @ [2048,4096], f8 in, f32 out
+    public async Task TestDynamicVectorizeMatMul(long[] lhsShape, long[] rhsShape, bool constA, bool constB, int[] hierarchy, int[] dynamicAxes, Runtime.TypeCode inType, Runtime.TypeCode? scaleType, Runtime.TypeCode outType, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".Skip(4 - hierarchy.Length));
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var dimVar = new DimVar("seq_len")
+        {
+            Metadata = new()
+            {
+                Range = new(1, 256),
+            },
+        };
+
+        var lhsDynShape = new RankedShape(Enumerable.Range(0, lhsShape.Length).Select(i =>
+        {
+            if (dynamicAxes.Contains(i))
+            {
+                return dimVar;
+            }
+
+            return (Dimension)lhsShape[i];
+        }).ToArray());
+        var lhsType = DataType.FromTypeCode(inType);
+        var lhsTensor = IR.F.Random.Normal(lhsType, 0, 1, 1, lhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(lhsShape, 1.0f).Evaluate().AsTensor();
+        Expr lhs = constA ? lhsTensor : new Var(new TensorType(lhsType, lhsDynShape));
+
+        if (!constA)
+        {
+            CompileOptions.ShapeBucketOptions.VarMap.Add((Var)lhs, lhs.CheckedShape.ToArray());
+        }
+
+        var rhsDynShape = new RankedShape(Enumerable.Range(0, rhsShape.Length).Select(i =>
+        {
+            if (dynamicAxes.Contains(lhsShape.Length + i))
+            {
+                return dimVar;
+            }
+
+            return (Dimension)rhsShape[i];
+        }).ToArray());
+        var rhsTensor = IR.F.Random.Normal(lhsType, 0, 1, 3, rhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(rhsShape, 1.0f).Evaluate().AsTensor();
+        Expr rhs = constB ? rhsTensor : new Var(new TensorType(lhsType, rhsDynShape));
+        if (!constB)
+        {
+            CompileOptions.ShapeBucketOptions.VarMap.Add((Var)rhs, rhs.CheckedShape.ToArray());
+        }
+
+        Expr scale = None.Default;
+        if (scaleType is not null)
+        {
+            scale = Tensor.FromScalar(1.23f).CastElementTo(DataType.FromTypeCode(scaleType.Value));
+        }
+
+        var pre = IR.F.Tensors.MatMul(lhs, rhs, DataType.FromTypeCode(outType), scale);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        foreach (var axis in dynamicAxes)
+        {
+            feedDict.Add(dimVar, Value.FromTensor(lhsShape.Concat(rhsShape).Skip(axis).Take(1).First()));
+        }
+
+        if (!constA)
+        {
+            feedDict.Add((Var)lhs, Value.FromTensor(lhsTensor));
+        }
+
+        if (!constB)
+        {
+            feedDict.Add((Var)rhs, Value.FromTensor(rhsTensor));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane, transB: true);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 154, 128 * 8 }, new long[] { 128 * 8, 64 * 32 }, false, true, new[] { 4 }, new[] { 0 }, 0 })] // note const(f32[sequence_length,2048]) @ [2048,4096]
+    [InlineData(new object[] { new long[] { 21, 128 }, new long[] { 128, 1024 }, false, true, new[] { 1 }, new[] { 0 }, 1 })] // note const(f32[sequence_length,2048]) @ [2048,4096]
+    public async Task TestDynamicPackedMatMul(long[] lhsShape, long[] rhsShape, bool constA, bool constB, int[] hierarchy, int[] dynamicAxes, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".Skip(4 - hierarchy.Length));
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var dimVar = new DimVar("seq_len")
+        {
+            Metadata = new()
+            {
+                Range = new(1, 255),
+            },
+        };
+
+        var lhsDynShape = new RankedShape(Enumerable.Range(0, lhsShape.Length).Select(i =>
+        {
+            if (dynamicAxes.Contains(i))
+            {
+                return dimVar;
+            }
+
+            return (Dimension)lhsShape[i];
+        }).ToArray());
+        var lhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(lhsShape, 1.0f).Evaluate().AsTensor();
+        Expr lhs = constA ? lhsTensor : new Var(new TensorType(DataTypes.Float32, lhsDynShape));
+
+        if (!constA)
+        {
+            CompileOptions.ShapeBucketOptions.VarMap.Add((Var)lhs, lhs.CheckedShape.ToArray());
+        }
+
+        var rhsDynShape = new RankedShape(Enumerable.Range(0, rhsShape.Length).Select(i =>
+        {
+            if (dynamicAxes.Contains(lhsShape.Length + i))
+            {
+                return dimVar;
+            }
+
+            return (Dimension)rhsShape[i];
+        }).ToArray());
+        var rhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(rhsShape, 1.0f).Evaluate().AsTensor();
+        Expr rhs = constB ? rhsTensor : new Var(new TensorType(DataTypes.Float32, rhsDynShape));
+        if (!constB)
+        {
+            CompileOptions.ShapeBucketOptions.VarMap.Add((Var)rhs, rhs.CheckedShape.ToArray());
+        }
+
+        var pre = IR.F.Tensors.MatMul(lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        foreach (var axis in dynamicAxes)
+        {
+            feedDict.Add(dimVar, Value.FromTensor(lhsShape.Concat(rhsShape).Skip(axis).Take(1).First()));
+        }
+
+        if (!constA)
+        {
+            feedDict.Add((Var)lhs, Value.FromTensor(lhsTensor));
+        }
+
+        if (!constB)
+        {
+            feedDict.Add((Var)rhs, Value.FromTensor(rhsTensor));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane, transB: true);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var vectorizedPosts = rule.GetReplaceCandidates(result!, new Passes.RunPassContext());
+
+        var packRule = new Passes.Rules.NTT.PackMatMulByN(4);
+        var posts = new List<Expr>();
+        foreach (var post in vectorizedPosts)
+        {
+            var context = new Passes.RunPassContext();
+            var newPost = CompilerServices.Rewrite(post, [packRule], context);
+            if (context.IsMutated)
+            {
+                posts.Add((Expr)newPost);
+            }
+        }
+
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 384, 512 }, new long[] { 2, 512, 512 }, false, true, new[] { 4, 4 }, 0 })]
+    [InlineData(new object[] { new long[] { 2, 384, 512 }, new long[] { 2, 512, 512 }, false, false, new[] { 4, 8 }, 1 })]
+    [InlineData(new object[] { new long[] { 2, 384, 512 }, new long[] { 2, 512, 512 }, false, true, new[] { 2, 8, 4 }, 2 })]
+    [InlineData(new object[] { new long[] { 2, 384, 512 }, new long[] { 2, 512, 512 }, false, false, new[] { 2, 4, 8 }, 3 })]
+    public async Task TestSUMMA(long[] lhsShape, long[] rhsShape, bool constA, bool constB, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".Skip(4 - hierarchy.Length));
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(lhsShape, 1.0f).Evaluate().AsTensor();
+        var rhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(rhsShape, 1.0f).Evaluate().AsTensor();
+
+        Expr lhs = constA ? lhsTensor : new Var(new TensorType(DataTypes.Float32, lhsShape));
+        Expr rhs = constB ? rhsTensor : new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Tensors.MatMul(lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        if (!constA)
+        {
+            feedDict.Add((Var)lhs, Value.FromTensor(lhsTensor));
+        }
+
+        if (!constB)
+        {
+            feedDict.Add((Var)rhs, Value.FromTensor(rhsTensor));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane, transB: false);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 384, 128 }, 0, new long[] { 1, 384 }, 0 })]
+    public async Task TestGather(long[] shape, int axis, long[] indicesShape, int count)
+    {
+        var vhidden_in = new Var("vhidden_in", new TensorType(DataTypes.Float32, shape));
+        var vposition_ids = new Var("vposition_ids", new TensorType(DataTypes.Int64, indicesShape));
+        var pre = IR.F.Tensors.Gather(vhidden_in, axis, vposition_ids); // f32[1,384,128]
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { vhidden_in, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+            { vposition_ids, IR.F.Random.Uniform(DataTypes.Int64, 6, 1, 1, indicesShape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeGather(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [MemberData(nameof(TestVectorizeReduceData))]
+    public async Task TestVectorizeReduce(ReduceOp reduceOp, long[] shape, int[] axes, float init, bool keepDims, int[] hierarchy, int[][] splitedAxes, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var tensorType = new TensorType(DataTypes.Float32, shape);
+        var input = new Var(tensorType);
+        var pre = IR.F.Tensors.Reduce(reduceOp, input, axes, init, keepDims);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        IEnumerable<BaseExpr> posts;
+        var rule = new Passes.Rules.NTT.VectorizeReduce(Rank, Lane);
+        if (!CompilerServices.TryMatch(pre, rule.Pattern, out var result))
+        {
+            return;
+        }
+
+        posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result, new Passes.RunPassContext()));
+
+        if (splitedAxes.Length > 0)
+        {
+            foreach (var post in posts)
+            {
+                if (post is Call { Target: IR.Tensors.Unpack } callUnVectorize && callUnVectorize.Arguments[0] is Call { Target: IR.NTT.VectorizedReduce } vectorizedReduceCall)
+                {
+                    vectorizedReduceCall.Arguments[0].Metadata = new() { OutputNames = new[] { "reduceIn" } };
+                }
+                else if (post is Call { Target: IR.Math.Reduce } reduceCall)
+                {
+                    reduceCall.Arguments[0].Metadata = new() { OutputNames = new[] { "reduceIn" } };
+                }
+            }
+
+            var scheme = new Passes.Distributed.DistributedSchema("1", "llama", [new("reduceIn", splitedAxes.Select(s => s[0] < 0 ? SBP.B : (SBP)SBP.S(s)).ToArray(), hierarchy, targetOptions.HierarchyNames)]);
+            var options = new JsonSerializerOptions();
+            options.Converters.Add(new SBPConverter());
+            options.WriteIndented = true;
+            var export = System.Text.Json.JsonSerializer.Serialize(scheme, options);
+            var dumpper = Diagnostics.DumpScope.Current.CreateSubDummper($"Theory{number}");
+            targetOptions.DistributedScheme = Path.Join(dumpper.Directory, "schema.json");
+            using (var stream = dumpper.OpenFile("schema.json"))
+            {
+                using (var writer = new StreamWriter(stream))
+                {
+                    writer.Write(export);
+                }
+            }
+        }
+
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 3, 28, 28 }, 0 })]
+    public async Task TestInstanceNormal(long[] shape, int number)
+    {
+        var input = new Var("input", new TensorType(DataTypes.Float32, shape));
+        Expr pre; // f32[1,3,28,28]
+        {
+            var v0 = IR.F.Tensors.Reduce(ReduceOp.Mean, input, new[] { 2, 3 }, 0f, true); // f32[1,3,1,1]
+            var v1 = IR.F.Math.Binary(BinaryOp.Sub, input, v0); // f32[1,3,28,28]
+            var v2 = IR.F.Math.Unary(UnaryOp.Square, v1); // f32[1,3,28,28]
+            var v3 = IR.F.Tensors.Reduce(ReduceOp.Mean, v2, new[] { 2, 3 }, 0f, true); // f32[1,3,1,1]
+            var v4 = IR.F.Math.Binary(BinaryOp.Add, v3, new float[] { 1E-05f }); // f32[1,3,1,1]
+            var v5 = IR.F.Math.Unary(UnaryOp.Rsqrt, v4); // f32[1,3,1,1]
+            var v6 = IR.F.Math.Binary(BinaryOp.Mul, v1, v5); // f32[1,3,28,28]
+            var v7 = IR.F.Math.Binary(BinaryOp.Mul, v6, new float[3, 1, 1] { { { 0.24680786f } }, { { 0.065782584f } }, { { -0.9344868f } } }); // f32[1,3,28,28]
+            pre = IR.F.Math.Binary(BinaryOp.Add, v7, new float[3, 1, 1] { { { 0.6403651f } }, { { -0.7995949f } }, { { 0.46802735f } } }); // f32[1,3,28,28]
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        var posts = new[] { pre };
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData([new long[] { 1, 384, 8192 }, new long[] { 1, 384, 64, 128 }, 1, new[] { 1 }, 0])]
+    [InlineData([new long[] { 1, 8192, 384 }, new long[] { 1, 64, 128, 384 }, 1, new[] { 1 }, 1])]
+    [InlineData([new long[] { 1, 8192, 384 }, new long[] { 1, 64, 128, 384 }, 1, new[] { 8 }, 2])]
+    public async Task TestVectorizeReshape(long[] inshape, long[] outshape, int vectorizeRank, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var input = new Var("input", new TensorType(DataTypes.Float32, inshape));
+        Expr pre;
+        {
+            pre = IR.F.Tensors.Reshape(input, outshape);
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, inshape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeReshape(vectorizeRank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData([new long[] { 2, 8, 16, 2 }, new int[] { 0, 2, 1, 3 }, 2, 0])]
+    [InlineData([new long[] { 1, 64, 384, 128 }, new int[] { 0, 2, 1, 3 }, 2, 1])]
+    public async Task TestTranspose(long[] shape, int[] perm, int rank, int number)
+    {
+        var input = new Var("input", new TensorType(DataTypes.Float32, shape));
+        Expr pre; // f32[1,3,28,28]
+        {
+            var v4 = IR.F.Tensors.Transpose(input, perm); // f32[1,64,384,128]
+            pre = v4;
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, Value.FromTensor(Tensor.From(Enumerable.Range(0, (int)TensorUtilities.GetProduct(shape)).Select(i => (float)i).ToArray(), shape)) },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeTranspose(rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData([new[] { 2, 4 }, 0])]
+    public async Task TestTransposeMatmul(int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Vectorize = true;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var v13 = new Var("v13", new TensorType(DataTypes.Float32, new[] { 1, 1, 384, 128 }));
+        var v15 = new Var("v15", new TensorType(DataTypes.Float32, new[] { 1, 64, 384, 128 }));
+        var v19 = new Var("v19", new TensorType(DataTypes.Float32, new[] { 1, 64, 384, 128 }));
+        var v24 = new Var("v24", new TensorType(DataTypes.Float32, new[] { 1, 64, 384, 128 }));
+        Expr pre; // f32[1,3,28,28]
+        {
+            var v25 = IR.F.Math.Binary(BinaryOp.Mul, v24, v13); // f32[1,64,384,128]
+            var v26 = IR.F.Math.Binary(BinaryOp.Add, v19, v25); // f32[1,64,384,128]
+            var v27 = IR.F.Tensors.Transpose(v26, new[] { 0L, 1L, 3L, 2L }); // f32[1,64,128,384]
+            var v28 = IR.F.Math.MatMul(v15, v27); // f32[1,64,384,384]
+            pre = v28;
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { v13, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, v13.CheckedShape).Evaluate() },
+            { v15, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, v15.CheckedShape).Evaluate() },
+            { v19, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, v19.CheckedShape).Evaluate() },
+            { v24, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, v24.CheckedShape).Evaluate() },
+        };
+
+        var posts = new[] { pre };
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 1, 4, 4 }, new long[] { 8, 1, 3, 3 }, new int[] { 1, 1, 1, 1 }, new int[] { 1, 1 }, 0 })]
+    [InlineData(new object[] { new long[] { 3, 2, 4, 4 }, new long[] { 8, 2, 3, 3 }, new int[] { 0, 0, 1, 1 }, new int[] { 1, 2 }, 1 })]
+    [InlineData(new object[] { new long[] { 3, 2, 4, 4 }, new long[] { 8, 2, 3, 3 }, new int[] { 1, 0, 1, 1 }, new int[] { 2, 1 }, 2 })]
+    [InlineData(new object[] { new long[] { 1, 512, 64, 64 }, new long[] { 512, 512, 3, 3 }, new int[] { 1, 1, 1, 1 }, new int[] { 1, 1 }, 3 })]
+    public async Task TestConv2DAndIm2col(long[] inputShape, long[] wShape, int[] padding, int[] strides, int count)
+    {
+        var dilation = new[] { 1, 1 };
+        var groups = 1;
+        var input = new Var(new TensorType(DataTypes.Float32, inputShape));
+        var weights = new Var(new TensorType(DataTypes.Float32, wShape));
+        var bias = IR.F.Random.Normal(DataTypes.Float32, new[] { wShape[0] }).Evaluate().AsTensor();
+        var pre = IR.F.NN.Conv2D(input, weights, bias, strides, new[,] { { padding[0], padding[1] }, { padding[2], padding[3] } }, dilation, PadMode.Constant, groups);
+        var outShape = pre.CheckedShape.ToValueArray();
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, inputShape).Evaluate() },
+            { weights, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, wShape).Evaluate() },
+        };
+
+        Expr post = Passes.Rules.NTT.VectorizeConv2D.AddCandidate(input, weights, bias, strides, padding, wShape, outShape);
+        Expr post2 = Passes.Rules.NTT.VectorizeConv2D.AddVectorizedCandidate(input, weights, bias, strides, padding, wShape, outShape, Lane);
+        var posts = new[] { pre, post, post2 };
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new long[] { 1, 48, 64, 16 }, new[] { UnaryOp.Neg, UnaryOp.Cos }, new[] { 8 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new long[] { 1, 64, 768 }, new[] { UnaryOp.Neg, UnaryOp.Cos }, new[] { 8 }, 1 })]
+    public async Task TestMatMulReshapeUnary(long[] lhsShape, long[] rhsShape, long[] newShape, UnaryOp[] unaryOps, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsShape));
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var matmul = IR.F.Tensors.MatMul(lhs, rhs);
+        var reshaped = IR.F.Tensors.Reshape(matmul, newShape);
+        var unary = reshaped;
+        foreach (var item in unaryOps)
+        {
+            unary = IR.F.Math.Unary(item, unary);
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, rhsShape).Evaluate() },
+        };
+
+        await RunCases($"Theory{number}", feedDict, new[] { unary });
+    }
+
+    [Theory]
+    [InlineData([new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new[] { 8 }, 0])]
+    public async Task TestVectorizePropagation(long[] lhsShape, long[] rhsShape, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsShape));
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, rhsShape).Evaluate() },
+        };
+
+        var candidates = new[] {
+            IR.F.Math.Unary(UnaryOp.Abs, lhs),
+            IR.F.Math.Binary(BinaryOp.Add, lhs, 1f),
+            IR.F.Tensors.Unsqueeze(lhs, new[] { 0 }),
+        };
+        var posts = new List<BaseExpr>();
+
+        foreach (var c in candidates)
+        {
+            var matmul = IR.F.Tensors.MatMul(c, rhs);
+
+            var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane);
+            CompilerServices.TryMatch(matmul, rule.Pattern, out var result);
+            var context = new Passes.RunPassContext();
+            var vectorized = rule.GetReplaceCandidates(result!, context);
+            var rules = new IRewriteRule[] {
+                new Nncase.Passes.Rules.NTT.VectorizeUnaryPropagation(),
+                new Nncase.Passes.Rules.NTT.VectorizeBinaryPropagation(),
+                new Nncase.Passes.Rules.NTT.VectorizeUnsqueezePropagation(),
+            };
+            posts.AddRange(vectorized.Select(ret => CompilerServices.Rewrite(ret, rules, context)));
+        }
+
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData([new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new[] { 8 }, 0])]
+    public async Task TestDevectorizePropagation(long[] lhsShape, long[] rhsShape, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsShape));
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var matmul = IR.F.Tensors.MatMul(lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, rhsShape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane);
+        CompilerServices.TryMatch(matmul, rule.Pattern, out var result);
+        var context = new Passes.RunPassContext();
+        var vectorized = rule.GetReplaceCandidates(result!, context).Cast<Expr>();
+        var posts = vectorized.Select(ret => CompilerServices.Rewrite(IR.F.Math.Unary(UnaryOp.Abs, ret), [new Nncase.Passes.Rules.NTT.UnaryDevectorizePropagation()], context)).ToList();
+        posts.AddRange(vectorized.Select(ret => CompilerServices.Rewrite(IR.F.Math.Binary(BinaryOp.Add, ret, 1f), [new Nncase.Passes.Rules.NTT.BinaryDevectorizeLhsPropagation()], context)));
+        posts.AddRange(vectorized.Select(ret => CompilerServices.Rewrite(IR.F.Tensors.Transpose(ret, new[] { 0, 2, 1 }), [new Nncase.Passes.Rules.NTT.TransposeDevectorizePropagation()], context)));
+        posts.AddRange(vectorized.Select(ret => CompilerServices.Rewrite(IR.F.Tensors.Unsqueeze(ret, new[] { 2 }), [new Nncase.Passes.Rules.NTT.UnsqueezeDevectorizePropagation()], context)));
+        posts.AddRange(vectorized.Select(ret => CompilerServices.Rewrite(IR.F.Tensors.Reduce(ReduceOp.Max, ret, new[] { 2 }, 0f, true), [new Nncase.Passes.Rules.NTT.ReduceDevectorizePropagation()], context)));
+        await RunCases($"Theory{number}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new long[] { 1, -1, 64, 16 }, new[] { UnaryOp.Neg, UnaryOp.Cos }, new[] { 1 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new long[] { 1, -1, 768 }, new[] { UnaryOp.Neg, UnaryOp.Cos }, new[] { 1 }, 1 })]
+    public async Task TestDynamicMatMulReshapeUnary(long[] lhsShape, long[] rhsShape, long[] newShape, UnaryOp[] unaryOps, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var dimM = new DimVar("m");
+        dimM.Metadata.Range = new(1, 48);
+        var lhsDims = lhsShape.Select(x => (Dimension)x).ToArray();
+        lhsDims[^2] = dimM;
+
+        var lhs = new Var(new TensorType(DataTypes.Float32, new RankedShape(lhsDims)));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(lhs, lhsDims.Select(x => x).ToArray());
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var matmul = IR.F.Tensors.MatMul(lhs, rhs);
+        var reshaped = IR.F.Tensors.Reshape(matmul, newShape);
+        var unary = reshaped;
+        foreach (var item in unaryOps)
+        {
+            unary = IR.F.Math.Unary(item, unary);
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, rhsShape).Evaluate() },
+            { dimM, Value.FromTensor(lhsShape[^2]) },
+        };
+
+        await RunCases($"Theory{number}", feedDict, new[] { unary });
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new long[] { 1, 48, 64, 16 }, new[] { UnaryOp.Neg, UnaryOp.Cos }, new[] { 1 }, 0 })]
+    public async Task TestReshapeAndUnsqueeze(long[] lhsShape, long[] rhsShape, long[] newShape, UnaryOp[] unaryOps, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var lhsDims = lhsShape.Select(x => (Dimension)x).ToArray();
+
+        var lhs = new Var(new TensorType(DataTypes.Float32, new RankedShape(lhsDims)));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(lhs, lhsDims.Select(x => x).ToArray());
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var matmul = IR.F.Tensors.MatMul(lhs, rhs);
+        var reshaped = IR.F.Tensors.Reshape(matmul, newShape);
+        var unary = reshaped;
+        foreach (var item in unaryOps)
+        {
+            unary = IR.F.Math.Unary(item, unary);
+        }
+
+        var unsqueezed = IR.F.Tensors.Unsqueeze(unary, new RankedShape(0));
+
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, rhsShape).Evaluate() },
+        };
+
+        await RunCases($"Theory{number}", feedDict, new[] { unsqueezed });
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 48, 512 }, new long[] { 1, 512, 1024 }, new long[] { 1, 48, -1, 16 }, new[] { UnaryOp.Neg, UnaryOp.Cos }, new[] { 1 }, 0 })]
+    public async Task TestDynamicReshapeAndUnsqueeze(long[] lhsShape, long[] rhsShape, long[] newShape, UnaryOp[] unaryOps, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var dimN = new DimVar("n");
+        dimN.Metadata.Range = new(1, 1024);
+        var rhsDims = rhsShape.Select(x => (Dimension)x).ToArray();
+        rhsDims[^1] = dimN;
+
+        var lhs = new Var(new TensorType(DataTypes.Float32, new RankedShape(lhsShape)));
+        var rhs = new Var(new TensorType(DataTypes.Float32, new RankedShape(rhsDims)));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(rhs, rhsDims.Select(x => x).ToArray());
+        var matmul = IR.F.Tensors.MatMul(lhs, rhs);
+        var reshaped = IR.F.Tensors.Reshape(matmul, newShape);
+        var unary = reshaped;
+        foreach (var item in unaryOps)
+        {
+            unary = IR.F.Math.Unary(item, unary);
+        }
+
+        var unsqueezed = IR.F.Tensors.Unsqueeze(unary, new RankedShape(0));
+
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 2, rhsShape).Evaluate() },
+            { dimN, Value.FromTensor(rhsShape[^1]) },
+        };
+
+        await RunCases($"Theory{number}", feedDict, new[] { unsqueezed });
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 2, 48, 512 }, new long[] { 0 }, new[] { 1 }, 0 })]
+    public async Task TestGetItem(long[] inShape, long[] indices, int[] hierarchy, int number)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        targetOptions.HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+
+        var inDims = inShape.Select(x => (Dimension)x).ToArray();
+
+        var input = new Var(new TensorType(DataTypes.Float32, new RankedShape(inDims)));
+        CompileOptions.ShapeBucketOptions.VarMap.Add(input, inDims.Select(x => x).ToArray());
+
+        var output = IR.F.Tensors.GetItem(input, indices);
+        output = IR.F.Math.Unary(UnaryOp.Cos, output);
+
+        var feedDict = new Dictionary<IVar, IValue>()
+        {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, inShape).Evaluate() },
+        };
+
+        await RunCases($"Theory{number}", feedDict, new[] { output });
+    }
+
+    [Theory(Skip = "ToBig")]
+    [InlineData(new object[] { false, 0 })]
+    [InlineData(new object[] { true, 1 })] // enable vectorize
+    public async Task TestDecodeLayer(bool vectorize, int count)
+    {
+        // Memory usage is too high for CI env
+        if (bool.TryParse(Environment.GetEnvironmentVariable("CI"), out var inCI) && inCI)
+        {
+            return;
+        }
+
+        ((NTTTargetOptions)CompileOptions.TargetOptions).Vectorize = vectorize;
+        var hierarchy = new[] { 2, 4 };
+        ((NTTTargetOptions)CompileOptions.TargetOptions).Hierarchies[0] = hierarchy;
+        ((NTTTargetOptions)CompileOptions.TargetOptions).HierarchyNames = string.Join(string.Empty, "cbwt".TakeLast(hierarchy.Length));
+        ((NTTTargetOptions)CompileOptions.TargetOptions).HierarchySizes = Enumerable.Repeat((long)MathF.Pow(2, 30), hierarchy.Length).ToArray();
+        var vhidden_in = new Var("vhidden_in", new TensorType(DataTypes.Float32, new[] { 1, 384, 8192 }));
+        var vattn_mask = new Var("vattn_mask", new TensorType(DataTypes.Float32, new[] { 1, 1, 384, 384 }));
+        var vposition_ids = new Var("vposition_ids", new TensorType(DataTypes.Int64, new[] { 1, 384 }));
+        Expr pre;
+        {
+            var v0 = IR.F.NN.LayerNorm(2, 1E-05f, vhidden_in, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 1, new[] { 8192 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 2, new[] { 8192 }).Evaluate().AsTensor(), false); // f32[1,384,8192]
+            var v1 = IR.F.Tensors.MatMul(v0, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 3, new[] { 8192, 8192 }).Evaluate().AsTensor()); // f32[1,384,8192]
+            var v2 = IR.F.Tensors.Reshape(v1, new long[] { 1L, 384L, 64L, 128L }); // f32[1,384,64,128]
+            var v3 = IR.F.Tensors.Transpose(v2, new long[] { 0L, 2L, 1L, 3L }); // f32[1,64,384,128]
+            var v4 = IR.F.Tensors.Gather(IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 4, new[] { 384, 128 }).Evaluate().AsTensor(), 0, vposition_ids); // f32[1,384,128]
+            var v5 = IR.F.Tensors.Reshape(v4, new[] { 1, 1, 384, 128 }); // f32[1,1,384,128]
+            var v6 = IR.F.Math.Binary(BinaryOp.Mul, v3, v5); // f32[1,64,384,128]
+            var v7 = IR.F.Tensors.Slice(v3, new long[] { 64L }, new long[] { 128L }, new long[] { 3L }, new long[] { 1L }); // f32[1,64,384,64]
+            var v8 = IR.F.Math.Unary(UnaryOp.Neg, v7); // f32[1,64,384,64]
+            var v9 = IR.F.Tensors.Slice(v3, new long[] { 0L }, new long[] { 64L }, new long[] { 3L }, new long[] { 1L }); // f32[1,64,384,64]
+            var v10 = new IR.Tuple(v8, v9); // (f32[1,64,384,64], f32[1,64,384,64])
+            var v11 = IR.F.Tensors.Concat(v10, 3); // f32[1,64,384,128]
+            var v12 = IR.F.Tensors.Gather(IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 5, new[] { 384, 128 }).Evaluate().AsTensor(), 0, vposition_ids); // f32[1,384,128]
+            var v13 = IR.F.Tensors.Reshape(v12, new[] { 1, 1, 384, 128 }); // f32[1,1,384,128]
+            var v14 = IR.F.Math.Binary(BinaryOp.Mul, v11, v13); // f32[1,64,384,128]
+            var v15 = IR.F.Math.Binary(BinaryOp.Add, v6, v14); // f32[1,64,384,128]
+            var v16 = IR.F.Tensors.MatMul(v0, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 6, new[] { 8192, 8192 }).Evaluate().AsTensor()); // f32[1,384,8192]
+            var v17 = IR.F.Tensors.Reshape(v16, new long[] { 1L, 384L, 64L, 128L }); // f32[1,384,64,128]
+            var v18 = IR.F.Tensors.Transpose(v17, new long[] { 0L, 2L, 1L, 3L }); // f32[1,64,384,128]
+            var v19 = IR.F.Math.Binary(BinaryOp.Mul, v18, v5); // f32[1,64,384,128]
+            var v20 = IR.F.Tensors.Slice(v18, new long[] { 64L }, new long[] { 128L }, new long[] { 3L }, new long[] { 1L }); // f32[1,64,384,64]
+            var v21 = IR.F.Math.Unary(UnaryOp.Neg, v20); // f32[1,64,384,64]
+            var v22 = IR.F.Tensors.Slice(v18, new long[] { 0L }, new long[] { 64L }, new long[] { 3L }, new long[] { 1L }); // f32[1,64,384,64]
+            var v23 = new IR.Tuple(v21, v22); // (f32[1,64,384,64], f32[1,64,384,64])
+            var v24 = IR.F.Tensors.Concat(v23, 3); // f32[1,64,384,128]
+            var v25 = IR.F.Math.Binary(BinaryOp.Mul, v24, v13); // f32[1,64,384,128]
+            var v26 = IR.F.Math.Binary(BinaryOp.Add, v19, v25); // f32[1,64,384,128]
+            var v27 = IR.F.Tensors.Transpose(v26, new long[] { 0L, 1L, 3L, 2L }); // f32[1,64,128,384]
+            var v28 = IR.F.Tensors.MatMul(v15, v27); // f32[1,64,384,384]
+            var v29 = IR.F.Math.Binary(BinaryOp.Div, v28, new[] { 11.31370f }); // f32[1,64,384,384]
+            var v30 = IR.F.Math.Binary(BinaryOp.Add, v29, vattn_mask); // f32[1,64,384,384]
+            var v31 = IR.F.NN.Softmax(v30, 3); // f32[1,64,384,384]
+            var v32 = IR.F.Tensors.MatMul(v0, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 7, new[] { 8192, 8192 }).Evaluate().AsTensor()); // f32[1,384,8192]
+            var v33 = IR.F.Tensors.Reshape(v32, new long[] { 1L, 384L, 64L, 128L }); // f32[1,384,64,128]
+            var v34 = IR.F.Tensors.Transpose(v33, new long[] { 0L, 2L, 1L, 3L }); // f32[1,64,384,128]
+            var v35 = IR.F.Tensors.MatMul(v31, v34); // f32[1,64,384,128]
+            var v36 = IR.F.Tensors.Transpose(v35, new long[] { 0L, 2L, 1L, 3L }); // f32[1,384,64,128]
+            var v37 = IR.F.Tensors.Reshape(v36, new long[] { 1L, 384L, 8192L }); // f32[1,384,8192]
+            var v38 = IR.F.Tensors.MatMul(v37, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 8, new[] { 8192, 8192 }).Evaluate().AsTensor()); // f32[1,384,8192]
+            var v39 = IR.F.Math.Binary(BinaryOp.Add, vhidden_in, v38); // f32[1,384,8192]
+            var v40 = IR.F.NN.LayerNorm(2, 1E-05f, v39, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 9, new[] { 8192 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 2, new[] { 8192 }).Evaluate().AsTensor(), false); // f32[1,384,8192]
+            var v41 = IR.F.Tensors.MatMul(v40, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 10, new[] { 8192, 22016 }).Evaluate().AsTensor()); // f32[1,384,22016]
+            var v42 = IR.F.NN.Swish(v41, 1.0f); // f32[1,384,22016]
+            var v43 = IR.F.Tensors.MatMul(v40, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 11, new[] { 8192, 22016 }).Evaluate().AsTensor()); // f32[1,384,22016]
+            var v44 = IR.F.Math.Binary(BinaryOp.Mul, v42, v43); // f32[1,384,22016]
+            var v45 = IR.F.Tensors.MatMul(v44, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 12, new[] { 22016, 8192 }).Evaluate().AsTensor()); // f32[1,384,8192]
+            var v46 = IR.F.Math.Binary(BinaryOp.Add, v39, v45); // f32[1,384,8192]
+            pre = v46;
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { vhidden_in, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 13,  new[] { 1, 384, 8192 }).Evaluate() },
+            { vattn_mask, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 14,  new[] { 1, 1, 384, 384 }).Evaluate() },
+            { vposition_ids, IR.F.Random.Uniform(DataTypes.Int64, 383, 1, 15, new[] { 1, 384 }).Evaluate() },
+        };
+
+        var posts = new[] { pre };
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+
+    // [InlineData(new object[] { false, 0 })]
+    [InlineData(new object[] { true, 1 })] // enable vectorize
+    public async Task TestVAEDecRes(bool vectorize, int count)
+    {
+        CompileOptions.TargetOptions = new NTTTargetOptions() { Vectorize = vectorize };
+        var vlatent_sample = new Var("vlatent_sample", new TensorType(DataTypes.Float32, new[] { 1, 4, 64, 64 }));
+        Expr pre;
+        {
+            var v0 = IR.F.NN.Conv2D(vlatent_sample, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 1, new[] { 4, 4, 1, 1 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 2, new[] { 4 }).Evaluate().AsTensor(), new[] { 1L, 1L }, new[,] { { 0L, 0L }, { 0L, 0L } }, new[] { 1L, 1L }, PadMode.Constant, 1L, new[] { float.NegativeInfinity, float.PositiveInfinity }); // f32[1,4,64,64]
+            var v1 = IR.F.NN.Conv2D(v0, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 3, new[] { 512, 4, 3, 3 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 4, new[] { 512 }).Evaluate().AsTensor(), new[] { 1L, 1L }, new[,] { { 1L, 1L }, { 1L, 1L } }, new[] { 1L, 1L }, PadMode.Constant, 1L, new[] { float.NegativeInfinity, float.PositiveInfinity }); // f32[1,512,64,64]
+            var v2 = IR.F.Tensors.Reshape(v1, new[] { 1L, 32L, 65536L }); // f32[1,32,65536]
+            var v3 = IR.F.NN.InstanceNormalization(v2, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 5, new[] { 32 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 6, new[] { 32 }).Evaluate().AsTensor(), 1E-06f); // f32[1,32,65536]
+            var v4 = IR.F.Tensors.Reshape(v3, new[] { 1L, 512L, 64L, 64L }); // f32[1,512,64,64]
+            var v5 = IR.F.Math.Binary(BinaryOp.Mul, v4, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 7, new[] { 512, 1, 1 }).Evaluate().AsTensor()); // f32[1,512,64,64]
+            var v6 = IR.F.Math.Binary(BinaryOp.Add, v5, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 8, new[] { 512, 1, 1 }).Evaluate().AsTensor()); // f32[1,512,64,64]
+            var v7 = IR.F.NN.Swish(v6, 1f); // f32[1,512,64,64]
+            var v8 = IR.F.NN.Conv2D(v7, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 9, new[] { 512, 512, 3, 3 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 10, new[] { 512 }).Evaluate().AsTensor(), new[] { 1L, 1L }, new[,] { { 1L, 1L }, { 1L, 1L } }, new[] { 1L, 1L }, PadMode.Constant, 1L, new[] { float.NegativeInfinity, float.PositiveInfinity }); // f32[1,512,64,64]
+            var v9 = IR.F.Tensors.Reshape(v8, new[] { 1L, 32L, 65536L }); // f32[1,32,65536]
+            var v10 = IR.F.NN.InstanceNormalization(v9, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 11, new[] { 32 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 12, new[] { 32 }).Evaluate().AsTensor(), 1E-06f); // f32[1,32,65536]
+            var v11 = IR.F.Tensors.Reshape(v10, new[] { 1L, 512L, 64L, 64L }); // f32[1,512,64,64]
+            var v12 = IR.F.Math.Binary(BinaryOp.Mul, v11, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 13, new[] { 512, 1, 1 }).Evaluate().AsTensor()); // f32[1,512,64,64]
+            var v13 = IR.F.Math.Binary(BinaryOp.Add, v12, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 14, new[] { 512, 1, 1 }).Evaluate().AsTensor()); // f32[1,512,64,64]
+            var v14 = IR.F.NN.Swish(v13, 1f); // f32[1,512,64,64]
+            var v15 = IR.F.NN.Conv2D(v14, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 15, new[] { 512, 512, 3, 3 }).Evaluate().AsTensor(), IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 16, new[] { 512 }).Evaluate().AsTensor(), new[] { 1L, 1L }, new[,] { { 1L, 1L }, { 1L, 1L } }, new[] { 1L, 1L }, PadMode.Constant, 1L, new[] { float.NegativeInfinity, float.PositiveInfinity }); // f32[1,512,64,64]
+            pre = IR.F.Math.Binary(BinaryOp.Add, v1, v15); // f32[1,512,64,64]
+        }
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { vlatent_sample, IR.F.Random.Normal(DataTypes.Float32, 0, 0.1, 13,  new[] { 1, 4, 64, 64 }).Evaluate() },
+        };
+
+        var posts = new[] { pre };
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 33, 512 }, new long[] { 512, 255 }, false, false, new[] { 8 }, 0 })]
+    public async Task TestNonUiniformDistMatmul(long[] lhsShape, long[] rhsShape, bool constA, bool constB, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyNames = string.Join(string.Empty, "cbwt".Skip(4 - hierarchy.Length));
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var lhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(lhsShape, 1.0f).Evaluate().AsTensor();
+        var rhsTensor = IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate().AsTensor(); // IR.F.Tensors.ConstantOfShape(rhsShape, 1.0f).Evaluate().AsTensor();
+
+        Expr lhs = constA ? lhsTensor : new Var(new TensorType(DataTypes.Float32, lhsShape));
+        Expr rhs = constB ? rhsTensor : new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Tensors.MatMul(lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>();
+        if (!constA)
+        {
+            feedDict.Add((Var)lhs, Value.FromTensor(lhsTensor));
+        }
+
+        if (!constB)
+        {
+            feedDict.Add((Var)rhs, Value.FromTensor(rhsTensor));
+        }
+
+        var rule = new Passes.Rules.NTT.VectorizeMatMul(2, Lane, transB: false);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 4, 4, 255 }, new[] { 8 }, 0 })]
+    public async Task TestNonUiniformDistUnary(long[] shape, int[] hierarchy, int count)
+    {
+        var targetOptions = (NTTTargetOptions)CompileOptions.TargetOptions;
+        targetOptions.Hierarchies[0] = hierarchy;
+        targetOptions.HierarchyLatencies = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        targetOptions.HierarchyBandWidths = Enumerable.Repeat(1, hierarchy.Length).ToArray();
+        var input = new Var(new TensorType(DataTypes.Float32, shape));
+        var pre = IR.F.Math.Unary(UnaryOp.Neg, input);
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeUnary(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { CompareOp.LowerThan, new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, 0 })]
+    public async Task TestVectorizeCompare(CompareOp op, long[] lhsShape, long[] rhsShape, int count)
+    {
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsShape));
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Math.Compare(op, lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate() },
+        };
+
+        var maskVectorStyle = RuntimeInformation.ProcessArchitecture switch
+        {
+            Architecture.X64 or Architecture.Arm64 => MaskVectorStyle.Fat,
+            _ => throw new NotSupportedException($"Unsupported architecture: {RuntimeInformation.ProcessArchitecture}"),
+        };
+        var rule = new Passes.Rules.NTT.VectorizeCompare(maskVectorStyle, Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 16, 1, 32 }, new long[] { 1, 16, 32, 32 }, 0 })]
+    [InlineData(new object[] { new long[] { 1, 1, 32, 32 }, new long[] { 1, 16, 32, 32 }, 1 })]
+    public async Task TestVectorizeExpand(long[] shape, long[] newShape, int count)
+    {
+        var input = new Var(new TensorType(DataTypes.Float32, shape));
+        var pre = IR.F.Tensors.Expand(input, newShape);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, shape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeExpand(1, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, 0 })]
+    [InlineData(new object[] { new long[] { 1 }, new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, 1 })]
+    [InlineData(new object[] { new long[] { 1, 8, 64, 16 }, new long[] { 1 }, new long[] { 1, 8, 64, 16 }, 2 })]
+    [InlineData(new object[] { new long[] { 1, 8, 64, 16 }, new long[] { 1, 1, 64, 16 }, new long[] { 1, 8, 64, 16 }, 3 })]
+    public async Task TestVectorizeWhere(long[] condShape, long[] lhsShape, long[] rhsShape, int count)
+    {
+        var cond = new Var(new TensorType(DataTypes.Boolean, condShape));
+        var lhs = new Var(new TensorType(DataTypes.Float32, lhsShape));
+        var rhs = new Var(new TensorType(DataTypes.Float32, rhsShape));
+        var pre = IR.F.Tensors.Where(cond, lhs, rhs);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { cond, IR.F.Random.Normal(DataTypes.Boolean, 0, 1, 1, condShape).Evaluate() },
+            { lhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, lhsShape).Evaluate() },
+            { rhs, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, rhsShape).Evaluate() },
+        };
+
+        var maskVectorStyle = RuntimeInformation.ProcessArchitecture switch
+        {
+            Architecture.X64 or Architecture.Arm64 => MaskVectorStyle.Fat,
+            _ => throw new NotSupportedException($"Unsupported architecture: {RuntimeInformation.ProcessArchitecture}"),
+        };
+        var rule = new Passes.Rules.NTT.VectorizeWhere(maskVectorStyle, Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, 1, 0 })]
+    [InlineData(new object[] { new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, 2, 1 })]
+    [InlineData(new object[] { new long[] { 1, 8, 64, 16 }, new long[] { 1, 8, 64, 16 }, 3, 2 })]
+    public async Task TestVectorizeConcat(long[] inShape1, long[] inShape2, int axis, int count)
+    {
+        var input1 = new Var(new TensorType(DataTypes.Float32, inShape1));
+        var input2 = new Var(new TensorType(DataTypes.Float32, inShape2));
+        var pre = IR.F.Tensors.Concat(new IR.Tuple(input1, input2), axis);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input1, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, inShape1).Evaluate() },
+            { input2, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, inShape2).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeConcat(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    [Theory]
+    [InlineData(new object[] { new long[] { 16, 16, 16 }, new long[] { 2, 1 }, new long[] { 16, 16, 16 }, 0 })]
+    [InlineData(new object[] { new long[] { 16, 16, 16 }, new long[] { 3, 2 }, new long[] { 16, 16 }, 1 })]
+    [InlineData(new object[] { new long[] { 16, 16, 256, 256 }, new long[] { 16, 16, 256, 256, 4 }, new long[] { 16, 16, 256, 256 }, 2 })]
+    public async Task TestVectorizeScatterND(long[] inShape, long[] indicesShape, long[] updatesShape, int count)
+    {
+        var input = new Var(new TensorType(DataTypes.Float32, inShape));
+        var indices = IR.F.Random.Uniform(DataTypes.Int64, 15, 0, 1, indicesShape).Evaluate().AsTensor();
+        var updates = new Var(new TensorType(DataTypes.Float32, updatesShape));
+        var pre = IR.F.Tensors.ScatterND(input, indices, updates);
+
+        var feedDict = new Dictionary<IVar, IValue>() {
+            { input, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 1, inShape).Evaluate() },
+            { updates, IR.F.Random.Normal(DataTypes.Float32, 0, 1, 3, updatesShape).Evaluate() },
+        };
+
+        var rule = new Passes.Rules.NTT.VectorizeScatterND(Rank, Lane);
+        CompilerServices.TryMatch(pre, rule.Pattern, out var result);
+        var posts = new[] { pre }.Concat(rule.GetReplaceCandidates(result!, new Passes.RunPassContext()));
+        await RunCases($"Theory{count}", feedDict, posts);
+    }
+
+    internal async Task RunCases(string dumpDir, Dictionary<IVar, IValue> feedDict, IEnumerable<BaseExpr> posts, Dictionary<IVar, IValue>? feedDictRT = null, bool enableAutoDist = true)
+    {
+        var postArray = posts.ToArray();
+        using var pinner = new ExprPinner(postArray);
+        for (int i = 0; i < postArray.Length; i++)
+        {
+#if DEBUG
+            System.Console.WriteLine(CompilerServices.Print(postArray[i]));
+#endif
+            var kernelCase = new CpuKernelCase($"Case{i}", new Fusion("kernel", CPUTarget.Kind, postArray[i], feedDict.Keys.ToArray()), feedDict.Keys.ToArray(), feedDict.Values.Select(v => v.AsTensor()).ToArray(), feedDictRT?.Values.Select(v => v.AsTensor()).ToArray() ?? []);
+            await Run(dumpDir, kernelCase, enableAutoDist: enableAutoDist);
+        }
+    }
+
+    internal async Task Run(string dumpDir, CpuKernelCase kernelCase, bool enableAutoDist = true)
+    {
+        using var dumpScope = new Diagnostics.DumpScope(Path.Join(dumpDir, kernelCase.Name), CompileOptions.DumpFlags);
+
+        // convert fusion to prim func
+        var fusion = kernelCase.Fusion;
+        if (fusion.Body.CheckedType is InvalidType)
+        {
+            return;
+        }
+
+        var main = new Function("main", DefaultTargetName, fusion.Body, kernelCase.Vars.ToArray());
+        main.Metadata = fusion.Body.Metadata;
+
+        var module = new IR.IRModule(main);
+        var inputs = kernelCase.Inputs.ToArray();
+        var outputs = fusion.Body.Evaluate(kernelCase.Vars.Zip(inputs).ToDictionary(p => p.First, p => (IValue)Value.FromTensor(p.Second))).AsTensors();
+
+#if DEBUG
+        for (var i = 0; i < inputs.Length; i++)
+        {
+            using (var fs = Diagnostics.DumpScope.Current.OpenFile($"input_{i}.json"))
+            {
+                JsonSerializer.Serialize(fs, inputs[i], JsonSerializerOptions.Default);
+            }
+        }
+
+        for (int i = 0; i < outputs.Length; i++)
+        {
+            using (var fs = Diagnostics.DumpScope.Current.OpenFile($"output_{i}.json"))
+            {
+                JsonSerializer.Serialize(fs, outputs[i], JsonSerializerOptions.Default);
+            }
+        }
+#endif
+        await Compile(module, enableAutoDist: enableAutoDist);
+        var (kmodel_path, _) = Testing.BuildKModel("test", module, CompileSession, false);
+        Tensor[] actuals;
+        if (kernelCase.RTInputs.Any())
+        {
+            actuals = Testing.RunKModel(kmodel_path, Diagnostics.DumpScope.Current.Directory, kernelCase.RTInputs.ToArray()).AsTensors();
+        }
+        else
+        {
+            actuals = Testing.RunKModel(kmodel_path, Diagnostics.DumpScope.Current.Directory, inputs).AsTensors();
+        }
+#if DEBUG
+        for (int i = 0; i < actuals.Length; i++)
+        {
+            using (var fs = Diagnostics.DumpScope.Current.OpenFile($"actual_{i}.json"))
+            {
+                JsonSerializer.Serialize(fs, actuals[i], JsonSerializerOptions.Default);
+            }
+        }
+#endif
+        for (int i = 0; i < outputs.Length; i++)
+        {
+            var cos = Comparator.CosSimilarity(outputs[i], actuals[i]);
+            Assert.True(cos > 0.999, $"the {Diagnostics.DumpScope.Current.Directory} output {i} cos: {cos} ");
+        }
+    }
+
+    private async Task Compile(IRModule module, bool enableAutoDist = true)
+    {
+        var pmgr = CompileSession.CreatePassManager("pmgr");
+        var compiler = (Nncase.Compiler.Compiler)CompileSession.Compiler;
+        compiler.TargetIndependentPass(pmgr);
+        CompileSessionScope.Current!.Target.RegisterPostAutoVectorizePass(pmgr, CompileSessionScope.Current!.CompileOptions);
+        if (enableAutoDist)
+        {
+            compiler.AutoDistributedPass(pmgr);
+        }
+
+        compiler.AutoTilingPass(pmgr);
+        compiler.TIRPass(pmgr);
+        await pmgr.RunAsync(module);
+    }
+}
diff --git a/tests/config.toml b/tests/config.toml
index c28659f82..137e4d7dc 100644
--- a/tests/config.toml
+++ b/tests/config.toml
@@ -139,6 +139,29 @@ Vectorize = true
 Hierarchies = [[1]]
 HierarchyNames = "t"
 
+[target.cuda]
+eval = false
+infer = true
+similarity_name = 'cosine'
+
+[target.cuda.mode.noptq]
+enabled = true
+threshold = 0.999
+
+[target.cuda.mode.ptq]
+enabled = false
+threshold = 0.98
+
+[target.cuda.target_options]
+Hierarchies = [[1]]
+HierarchyNames = "t"
+HierarchySizes = [603979776, 1048576]
+MemoryCapacities = [262144, 150994944]
+MemoryBandWidths = [64, 32]
+UnifiedMemoryArch = false
+Vectorize = true
+HierarchyKind = "nncase.HierarchyKind.SMT"
+
 [target.k510]
 eval = true
 infer = true
diff --git a/tests/other/test_targets.py b/tests/other/test_targets.py
index 4f9ae7b28..c30e4ebac 100644
--- a/tests/other/test_targets.py
+++ b/tests/other/test_targets.py
@@ -19,6 +19,7 @@
 
 def test_targets(request):
     assert nncase.check_target("cpu")
+    assert nncase.check_target("cuda")
     #assert nncase.check_target("k210")
     #assert nncase.check_target("vulkan")
 
diff --git a/toolchains/riscv64.cmake b/toolchains/riscv64.cmake
new file mode 100644
index 000000000..09c3762ef
--- /dev/null
+++ b/toolchains/riscv64.cmake
@@ -0,0 +1,32 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+if(DEFINED ENV{RISCV_ROOT_PATH})
+    file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
+endif()
+
+if(NOT RISCV_ROOT_PATH)
+    message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined for rtos runtime")
+endif()
+
+set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
+set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/clang")
+set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/clang++")
+#set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc")
+#set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++")
+set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-elf")
+
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(ENABLE_VULKAN_RUNTIME OFF)
+set(ENABLE_OPENMP OFF)
+set(ENABLE_HALIDE OFF)
+set(DEFAULT_BUILTIN_RUNTIMES OFF)
+set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF)
+set(BUILD_BENCHMARK OFF)
+
+set(BUILDING_RUNTIME ON)
+set(ENABLE_K230_RUNTIME ON)
+set(BUILD_SHARED_LIBS OFF)
\ No newline at end of file
diff --git a/toolchains/x86_64-linux-cuda.profile.jinja b/toolchains/x86_64-linux-cuda.profile.jinja
new file mode 100644
index 000000000..0e607e8d3
--- /dev/null
+++ b/toolchains/x86_64-linux-cuda.profile.jinja
@@ -0,0 +1,12 @@
+[conf]
+tools.cmake.cmaketoolchain:generator=Ninja
+tools.build:compiler_executables={"cpp": "clang++", "c": "clang", "cuda": "clang++"}
+
+[settings]
+os=Linux
+arch=x86_64
+build_type=Release
+compiler=clang
+compiler.cppstd=20
+compiler.libcxx=libstdc++11
+compiler.version=21